1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s 3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s 4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s 7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s 8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s 9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s 10 11; -------------------------------------------------------------------- 12; float 13; -------------------------------------------------------------------- 14 15define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 16; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 17; GFX12: ; %bb.0: 18; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 19; GFX12-NEXT: s_wait_expcnt 0x0 20; GFX12-NEXT: s_wait_samplecnt 0x0 21; GFX12-NEXT: s_wait_bvhcnt 0x0 22; GFX12-NEXT: s_wait_kmcnt 0x0 23; GFX12-NEXT: s_wait_storecnt 0x0 24; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 25; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 26; GFX12-NEXT: global_inv scope:SCOPE_DEV 27; GFX12-NEXT: s_setpc_b64 s[30:31] 28; 29; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 30; GFX940: ; %bb.0: 31; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32; GFX940-NEXT: buffer_wbl2 sc1 33; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 34; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 35; GFX940-NEXT: buffer_inv sc1 36; GFX940-NEXT: s_setpc_b64 s[30:31] 37; 38; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 39; GFX11: ; %bb.0: 40; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 42; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc 43; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 44; GFX11-NEXT: buffer_gl1_inv 45; GFX11-NEXT: buffer_gl0_inv 46; GFX11-NEXT: s_setpc_b64 s[30:31] 47; 48; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 49; GFX10: ; %bb.0: 50; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 51; GFX10-NEXT: flat_load_dword v3, v[0:1] 52; GFX10-NEXT: s_mov_b32 s4, 0 53; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start 54; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 55; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 56; GFX10-NEXT: v_mov_b32_e32 v4, v3 57; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 58; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 59; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 60; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 61; GFX10-NEXT: buffer_gl1_inv 62; GFX10-NEXT: buffer_gl0_inv 63; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 64; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 65; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 66; GFX10-NEXT: s_cbranch_execnz .LBB0_1 67; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 68; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 69; GFX10-NEXT: v_mov_b32_e32 v0, v3 70; GFX10-NEXT: s_setpc_b64 s[30:31] 71; 72; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 73; GFX90A: ; %bb.0: 74; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 75; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 76; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 77; GFX90A-NEXT: ; implicit-def: $vgpr3 78; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 79; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 80; GFX90A-NEXT: 
s_cbranch_execz .LBB0_6 81; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private 82; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 83; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 84; GFX90A-NEXT: ; implicit-def: $vgpr3 85; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 86; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 87; GFX90A-NEXT: s_cbranch_execz .LBB0_3 88; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global 89; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc 90; GFX90A-NEXT: s_waitcnt vmcnt(0) 91; GFX90A-NEXT: buffer_wbinvl1 92; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 93; GFX90A-NEXT: ; implicit-def: $vgpr2 94; GFX90A-NEXT: .LBB0_3: ; %Flow 95; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 96; GFX90A-NEXT: s_cbranch_execz .LBB0_5 97; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private 98; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 99; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 100; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen 101; GFX90A-NEXT: s_waitcnt vmcnt(0) 102; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 103; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 104; GFX90A-NEXT: .LBB0_5: ; %Flow1 105; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 106; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 107; GFX90A-NEXT: ; implicit-def: $vgpr2 108; GFX90A-NEXT: .LBB0_6: ; %Flow2 109; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 110; GFX90A-NEXT: s_cbranch_execz .LBB0_8 111; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared 112; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 113; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 114; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 115; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 116; GFX90A-NEXT: .LBB0_8: ; %atomicrmw.phi 117; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 118; GFX90A-NEXT: v_mov_b32_e32 v0, v3 119; GFX90A-NEXT: s_waitcnt vmcnt(0) 120; GFX90A-NEXT: s_setpc_b64 s[30:31] 121; 122; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 123; GFX908: ; %bb.0: 124; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 125; GFX908-NEXT: flat_load_dword v3, v[0:1] 126; GFX908-NEXT: s_mov_b64 s[4:5], 0 127; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start 128; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 129; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 130; GFX908-NEXT: v_mov_b32_e32 v4, v3 131; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 132; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 133; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GFX908-NEXT: buffer_wbinvl1 135; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 136; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 137; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 138; GFX908-NEXT: s_cbranch_execnz .LBB0_1 139; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 140; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 141; GFX908-NEXT: v_mov_b32_e32 v0, v3 142; GFX908-NEXT: s_setpc_b64 s[30:31] 143; 144; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 145; GFX8: ; %bb.0: 146; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 147; GFX8-NEXT: flat_load_dword v3, v[0:1] 148; GFX8-NEXT: s_mov_b64 s[4:5], 0 149; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start 150; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 151; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GFX8-NEXT: v_mov_b32_e32 v4, v3 153; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 154; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 155; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 156; GFX8-NEXT: buffer_wbinvl1 157; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 158; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 159; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 160; GFX8-NEXT: s_cbranch_execnz .LBB0_1 161; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 162; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 163; GFX8-NEXT: v_mov_b32_e32 v0, v3 164; GFX8-NEXT: s_setpc_b64 s[30:31] 165; 166; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 167; GFX7: ; %bb.0: 168; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 169; 
GFX7-NEXT: flat_load_dword v3, v[0:1] 170; GFX7-NEXT: s_mov_b64 s[4:5], 0 171; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start 172; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 173; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 174; GFX7-NEXT: v_mov_b32_e32 v4, v3 175; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 176; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 177; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 178; GFX7-NEXT: buffer_wbinvl1 179; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 180; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 181; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 182; GFX7-NEXT: s_cbranch_execnz .LBB0_1 183; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 184; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 185; GFX7-NEXT: v_mov_b32_e32 v0, v3 186; GFX7-NEXT: s_setpc_b64 s[30:31] 187 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 188 ret float %result 189} 190 191define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 192; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 193; GFX12: ; %bb.0: 194; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 195; GFX12-NEXT: s_wait_expcnt 0x0 196; GFX12-NEXT: s_wait_samplecnt 0x0 197; GFX12-NEXT: s_wait_bvhcnt 0x0 198; GFX12-NEXT: s_wait_kmcnt 0x0 199; GFX12-NEXT: s_wait_storecnt 0x0 200; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 201; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 202; GFX12-NEXT: global_inv scope:SCOPE_DEV 203; GFX12-NEXT: s_setpc_b64 s[30:31] 204; 205; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 206; GFX940: ; %bb.0: 207; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 208; GFX940-NEXT: buffer_wbl2 sc1 209; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 
210; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 211; GFX940-NEXT: buffer_inv sc1 212; GFX940-NEXT: s_setpc_b64 s[30:31] 213; 214; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 215; GFX11: ; %bb.0: 216; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 217; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 218; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc 219; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 220; GFX11-NEXT: buffer_gl1_inv 221; GFX11-NEXT: buffer_gl0_inv 222; GFX11-NEXT: s_setpc_b64 s[30:31] 223; 224; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 225; GFX10: ; %bb.0: 226; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 227; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 228; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 229; GFX10-NEXT: s_mov_b32 s4, 0 230; GFX10-NEXT: flat_load_dword v0, v[3:4] 231; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start 232; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 233; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 234; GFX10-NEXT: v_mov_b32_e32 v1, v0 235; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 236; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 237; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 238; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 239; GFX10-NEXT: buffer_gl1_inv 240; GFX10-NEXT: buffer_gl0_inv 241; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 242; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 243; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 244; GFX10-NEXT: s_cbranch_execnz .LBB1_1 245; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 246; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 247; GFX10-NEXT: s_setpc_b64 s[30:31] 248; 249; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 250; GFX90A: ; %bb.0: 251; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 252; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, 
v0 253; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 254; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 255; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 256; GFX90A-NEXT: ; implicit-def: $vgpr0 257; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 258; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 259; GFX90A-NEXT: s_cbranch_execnz .LBB1_3 260; GFX90A-NEXT: ; %bb.1: ; %Flow2 261; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 262; GFX90A-NEXT: s_cbranch_execnz .LBB1_8 263; GFX90A-NEXT: .LBB1_2: ; %atomicrmw.phi 264; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 265; GFX90A-NEXT: s_waitcnt vmcnt(0) 266; GFX90A-NEXT: s_setpc_b64 s[30:31] 267; GFX90A-NEXT: .LBB1_3: ; %atomicrmw.check.private 268; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 269; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 270; GFX90A-NEXT: ; implicit-def: $vgpr0 271; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 272; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 273; GFX90A-NEXT: s_cbranch_execz .LBB1_5 274; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 275; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc 276; GFX90A-NEXT: s_waitcnt vmcnt(0) 277; GFX90A-NEXT: buffer_wbinvl1 278; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 279; GFX90A-NEXT: ; implicit-def: $vgpr2 280; GFX90A-NEXT: .LBB1_5: ; %Flow 281; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 282; GFX90A-NEXT: s_cbranch_execz .LBB1_7 283; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 284; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 285; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc 286; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen 287; GFX90A-NEXT: s_waitcnt vmcnt(0) 288; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 289; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 290; GFX90A-NEXT: .LBB1_7: ; %Flow1 291; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 292; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 293; GFX90A-NEXT: ; implicit-def: $vgpr2 294; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 295; GFX90A-NEXT: s_cbranch_execz .LBB1_2 296; 
GFX90A-NEXT: .LBB1_8: ; %atomicrmw.shared 297; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 298; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc 299; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 300; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 301; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 302; GFX90A-NEXT: s_waitcnt vmcnt(0) 303; GFX90A-NEXT: s_setpc_b64 s[30:31] 304; 305; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 306; GFX908: ; %bb.0: 307; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 308; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 309; GFX908-NEXT: s_mov_b64 s[4:5], 0 310; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start 311; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 312; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 313; GFX908-NEXT: v_mov_b32_e32 v4, v3 314; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 315; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 316; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 317; GFX908-NEXT: buffer_wbinvl1 318; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 319; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 320; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 321; GFX908-NEXT: s_cbranch_execnz .LBB1_1 322; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 323; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 324; GFX908-NEXT: v_mov_b32_e32 v0, v3 325; GFX908-NEXT: s_setpc_b64 s[30:31] 326; 327; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 328; GFX8: ; %bb.0: 329; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 330; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 331; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 332; GFX8-NEXT: flat_load_dword v0, v[3:4] 333; GFX8-NEXT: s_mov_b64 s[4:5], 0 334; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start 335; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 336; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 337; GFX8-NEXT: v_mov_b32_e32 v1, v0 338; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 
339; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 340; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 341; GFX8-NEXT: buffer_wbinvl1 342; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 343; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 344; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 345; GFX8-NEXT: s_cbranch_execnz .LBB1_1 346; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 347; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 348; GFX8-NEXT: s_setpc_b64 s[30:31] 349; 350; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 351; GFX7: ; %bb.0: 352; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 353; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 354; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 355; GFX7-NEXT: flat_load_dword v0, v[3:4] 356; GFX7-NEXT: s_mov_b64 s[4:5], 0 357; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start 358; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 359; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 360; GFX7-NEXT: v_mov_b32_e32 v1, v0 361; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 362; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 363; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 364; GFX7-NEXT: buffer_wbinvl1 365; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 366; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 367; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 368; GFX7-NEXT: s_cbranch_execnz .LBB1_1 369; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 370; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 371; GFX7-NEXT: s_setpc_b64 s[30:31] 372 %gep = getelementptr float, ptr %ptr, i64 511 373 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 374 ret float %result 375} 376 377define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 378; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 379; GFX12: ; %bb.0: 
380; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 381; GFX12-NEXT: s_wait_expcnt 0x0 382; GFX12-NEXT: s_wait_samplecnt 0x0 383; GFX12-NEXT: s_wait_bvhcnt 0x0 384; GFX12-NEXT: s_wait_kmcnt 0x0 385; GFX12-NEXT: s_wait_storecnt 0x0 386; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 387; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 388; GFX12-NEXT: global_inv scope:SCOPE_DEV 389; GFX12-NEXT: s_setpc_b64 s[30:31] 390; 391; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 392; GFX940: ; %bb.0: 393; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 394; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 395; GFX940-NEXT: s_nop 1 396; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 397; GFX940-NEXT: buffer_wbl2 sc1 398; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 399; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 400; GFX940-NEXT: buffer_inv sc1 401; GFX940-NEXT: s_setpc_b64 s[30:31] 402; 403; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 404; GFX11: ; %bb.0: 405; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 406; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 407; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 408; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 409; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc 410; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 411; GFX11-NEXT: buffer_gl1_inv 412; GFX11-NEXT: buffer_gl0_inv 413; GFX11-NEXT: s_setpc_b64 s[30:31] 414; 415; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 416; GFX10: ; %bb.0: 417; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 418; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 419; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 420; GFX10-NEXT: s_mov_b32 s4, 0 421; GFX10-NEXT: flat_load_dword v0, v[3:4] 422; 
GFX10-NEXT: .LBB2_1: ; %atomicrmw.start 423; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 424; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 425; GFX10-NEXT: v_mov_b32_e32 v1, v0 426; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 427; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 428; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 429; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 430; GFX10-NEXT: buffer_gl1_inv 431; GFX10-NEXT: buffer_gl0_inv 432; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 433; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 434; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 435; GFX10-NEXT: s_cbranch_execnz .LBB2_1 436; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 437; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 438; GFX10-NEXT: s_setpc_b64 s[30:31] 439; 440; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 441; GFX90A: ; %bb.0: 442; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 443; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 444; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 445; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 446; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 447; GFX90A-NEXT: ; implicit-def: $vgpr0 448; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 449; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 450; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 451; GFX90A-NEXT: ; %bb.1: ; %Flow2 452; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 453; GFX90A-NEXT: s_cbranch_execnz .LBB2_8 454; GFX90A-NEXT: .LBB2_2: ; %atomicrmw.phi 455; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 456; GFX90A-NEXT: s_waitcnt vmcnt(0) 457; GFX90A-NEXT: s_setpc_b64 s[30:31] 458; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private 459; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 460; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 461; GFX90A-NEXT: ; implicit-def: $vgpr0 462; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 463; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 464; GFX90A-NEXT: s_cbranch_execz .LBB2_5 465; 
GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 466; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc 467; GFX90A-NEXT: s_waitcnt vmcnt(0) 468; GFX90A-NEXT: buffer_wbinvl1 469; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 470; GFX90A-NEXT: ; implicit-def: $vgpr2 471; GFX90A-NEXT: .LBB2_5: ; %Flow 472; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 473; GFX90A-NEXT: s_cbranch_execz .LBB2_7 474; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 475; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 476; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc 477; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen 478; GFX90A-NEXT: s_waitcnt vmcnt(0) 479; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 480; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 481; GFX90A-NEXT: .LBB2_7: ; %Flow1 482; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 483; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 484; GFX90A-NEXT: ; implicit-def: $vgpr2 485; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 486; GFX90A-NEXT: s_cbranch_execz .LBB2_2 487; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.shared 488; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 489; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc 490; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 491; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 492; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 493; GFX90A-NEXT: s_waitcnt vmcnt(0) 494; GFX90A-NEXT: s_setpc_b64 s[30:31] 495; 496; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 497; GFX908: ; %bb.0: 498; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 499; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 500; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 501; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 502; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 503; GFX908-NEXT: flat_load_dword v0, v[0:1] 504; GFX908-NEXT: s_mov_b64 s[4:5], 0 505; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start 506; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 507; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 508; GFX908-NEXT: v_mov_b32_e32 v1, v0 509; GFX908-NEXT: v_add_f32_e32 v0, v1, v2 510; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 511; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 512; GFX908-NEXT: buffer_wbinvl1 513; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 514; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 515; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 516; GFX908-NEXT: s_cbranch_execnz .LBB2_1 517; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 518; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 519; GFX908-NEXT: s_setpc_b64 s[30:31] 520; 521; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 522; GFX8: ; %bb.0: 523; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 524; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 525; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 526; GFX8-NEXT: flat_load_dword v0, v[3:4] 527; GFX8-NEXT: s_mov_b64 s[4:5], 0 528; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start 529; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 530; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 531; GFX8-NEXT: v_mov_b32_e32 v1, v0 532; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 533; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 534; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 535; GFX8-NEXT: buffer_wbinvl1 536; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 537; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 538; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 539; GFX8-NEXT: s_cbranch_execnz .LBB2_1 540; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 541; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 542; GFX8-NEXT: s_setpc_b64 s[30:31] 543; 544; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 545; GFX7: ; %bb.0: 546; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 547; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 548; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 549; GFX7-NEXT: flat_load_dword v0, v[3:4] 550; GFX7-NEXT: 
s_mov_b64 s[4:5], 0 551; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start 552; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 553; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 554; GFX7-NEXT: v_mov_b32_e32 v1, v0 555; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 556; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 557; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 558; GFX7-NEXT: buffer_wbinvl1 559; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 560; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 561; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 562; GFX7-NEXT: s_cbranch_execnz .LBB2_1 563; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 564; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 565; GFX7-NEXT: s_setpc_b64 s[30:31] 566 %gep = getelementptr float, ptr %ptr, i64 -512 567 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 568 ret float %result 569} 570 571define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 572; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 573; GFX12: ; %bb.0: 574; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 575; GFX12-NEXT: s_wait_expcnt 0x0 576; GFX12-NEXT: s_wait_samplecnt 0x0 577; GFX12-NEXT: s_wait_bvhcnt 0x0 578; GFX12-NEXT: s_wait_kmcnt 0x0 579; GFX12-NEXT: s_wait_storecnt 0x0 580; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV 581; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 582; GFX12-NEXT: global_inv scope:SCOPE_DEV 583; GFX12-NEXT: s_setpc_b64 s[30:31] 584; 585; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 586; GFX940: ; %bb.0: 587; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 588; GFX940-NEXT: buffer_wbl2 sc1 589; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 590; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 591; GFX940-NEXT: buffer_inv sc1 592; GFX940-NEXT: s_setpc_b64 s[30:31] 593; 594; 
GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 595; GFX11: ; %bb.0: 596; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 597; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 598; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 599; GFX11-NEXT: s_waitcnt lgkmcnt(0) 600; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 601; GFX11-NEXT: buffer_gl1_inv 602; GFX11-NEXT: buffer_gl0_inv 603; GFX11-NEXT: s_setpc_b64 s[30:31] 604; 605; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 606; GFX10: ; %bb.0: 607; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 608; GFX10-NEXT: flat_load_dword v4, v[0:1] 609; GFX10-NEXT: s_mov_b32 s4, 0 610; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start 611; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 612; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 613; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 614; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 615; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 616; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 617; GFX10-NEXT: buffer_gl1_inv 618; GFX10-NEXT: buffer_gl0_inv 619; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 620; GFX10-NEXT: v_mov_b32_e32 v4, v3 621; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 622; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 623; GFX10-NEXT: s_cbranch_execnz .LBB3_1 624; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 625; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 626; GFX10-NEXT: s_setpc_b64 s[30:31] 627; 628; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 629; GFX90A: ; %bb.0: 630; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 631; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 632; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 633; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 634; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 635; GFX90A-NEXT: s_cbranch_execnz .LBB3_3 636; GFX90A-NEXT: ; %bb.1: ; %Flow2 637; GFX90A-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] 638; GFX90A-NEXT: s_cbranch_execnz .LBB3_8 639; GFX90A-NEXT: .LBB3_2: ; %atomicrmw.phi 640; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 641; GFX90A-NEXT: s_waitcnt vmcnt(0) 642; GFX90A-NEXT: s_setpc_b64 s[30:31] 643; GFX90A-NEXT: .LBB3_3: ; %atomicrmw.check.private 644; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 645; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 646; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 647; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 648; GFX90A-NEXT: s_cbranch_execz .LBB3_5 649; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 650; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 651; GFX90A-NEXT: s_waitcnt vmcnt(0) 652; GFX90A-NEXT: buffer_wbinvl1 653; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 654; GFX90A-NEXT: ; implicit-def: $vgpr2 655; GFX90A-NEXT: .LBB3_5: ; %Flow 656; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 657; GFX90A-NEXT: s_cbranch_execz .LBB3_7 658; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 659; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 660; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 661; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 662; GFX90A-NEXT: s_waitcnt vmcnt(0) 663; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 664; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 665; GFX90A-NEXT: .LBB3_7: ; %Flow1 666; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 667; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 668; GFX90A-NEXT: ; implicit-def: $vgpr2 669; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 670; GFX90A-NEXT: s_cbranch_execz .LBB3_2 671; GFX90A-NEXT: .LBB3_8: ; %atomicrmw.shared 672; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 673; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 674; GFX90A-NEXT: ds_add_f32 v0, v2 675; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 676; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 677; GFX90A-NEXT: s_waitcnt vmcnt(0) 678; GFX90A-NEXT: s_setpc_b64 s[30:31] 679; 680; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 681; 
GFX908: ; %bb.0: 682; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 683; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 684; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 685; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 686; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 687; GFX908-NEXT: s_cbranch_execnz .LBB3_3 688; GFX908-NEXT: ; %bb.1: ; %Flow2 689; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 690; GFX908-NEXT: s_cbranch_execnz .LBB3_8 691; GFX908-NEXT: .LBB3_2: ; %atomicrmw.phi 692; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 693; GFX908-NEXT: s_waitcnt vmcnt(0) 694; GFX908-NEXT: s_setpc_b64 s[30:31] 695; GFX908-NEXT: .LBB3_3: ; %atomicrmw.check.private 696; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 697; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 698; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 699; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 700; GFX908-NEXT: s_cbranch_execz .LBB3_5 701; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 702; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 703; GFX908-NEXT: s_waitcnt vmcnt(0) 704; GFX908-NEXT: buffer_wbinvl1 705; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 706; GFX908-NEXT: ; implicit-def: $vgpr2 707; GFX908-NEXT: .LBB3_5: ; %Flow 708; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 709; GFX908-NEXT: s_cbranch_execz .LBB3_7 710; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 711; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 712; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 713; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 714; GFX908-NEXT: s_waitcnt vmcnt(0) 715; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 716; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 717; GFX908-NEXT: .LBB3_7: ; %Flow1 718; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 719; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 720; GFX908-NEXT: ; implicit-def: $vgpr2 721; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 722; GFX908-NEXT: s_cbranch_execz .LBB3_2 723; GFX908-NEXT: .LBB3_8: ; %atomicrmw.shared 724; GFX908-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[0:1] 725; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 726; GFX908-NEXT: ds_add_f32 v0, v2 727; GFX908-NEXT: s_waitcnt lgkmcnt(0) 728; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 729; GFX908-NEXT: s_waitcnt vmcnt(0) 730; GFX908-NEXT: s_setpc_b64 s[30:31] 731; 732; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 733; GFX8: ; %bb.0: 734; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 735; GFX8-NEXT: flat_load_dword v4, v[0:1] 736; GFX8-NEXT: s_mov_b64 s[4:5], 0 737; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start 738; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 739; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 740; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 741; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 742; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 743; GFX8-NEXT: buffer_wbinvl1 744; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 745; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 746; GFX8-NEXT: v_mov_b32_e32 v4, v3 747; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 748; GFX8-NEXT: s_cbranch_execnz .LBB3_1 749; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 750; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 751; GFX8-NEXT: s_setpc_b64 s[30:31] 752; 753; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 754; GFX7: ; %bb.0: 755; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 756; GFX7-NEXT: flat_load_dword v4, v[0:1] 757; GFX7-NEXT: s_mov_b64 s[4:5], 0 758; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start 759; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 760; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 761; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 762; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 763; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 764; GFX7-NEXT: buffer_wbinvl1 765; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 766; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 767; GFX7-NEXT: v_mov_b32_e32 v4, v3 768; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 769; GFX7-NEXT: s_cbranch_execnz .LBB3_1 
; Test @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode
; (defined on the following line): a seq_cst, syncscope("agent") atomicrmw fadd of a float through
; %ptr advanced by 511 floats, with the result left unused. 511 * 4 = 2044 = 0x7fc, which is the
; immediate seen in the checks: gfx12, gfx940 and gfx11 fold it as offset:2044, the remaining
; targets add 0x7fc to the 64-bit address first. Both !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode are attached to the atomicrmw.
; The per-target check lines are autogenerated by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing them by hand.
770; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 771; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 772; GFX7-NEXT: s_setpc_b64 s[30:31] 773 %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 774 ret void 775} 776 777define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 778; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 779; GFX12: ; %bb.0: 780; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 781; GFX12-NEXT: s_wait_expcnt 0x0 782; GFX12-NEXT: s_wait_samplecnt 0x0 783; GFX12-NEXT: s_wait_bvhcnt 0x0 784; GFX12-NEXT: s_wait_kmcnt 0x0 785; GFX12-NEXT: s_wait_storecnt 0x0 786; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 787; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 788; GFX12-NEXT: global_inv scope:SCOPE_DEV 789; GFX12-NEXT: s_setpc_b64 s[30:31] 790; 791; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 792; GFX940: ; %bb.0: 793; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 794; GFX940-NEXT: buffer_wbl2 sc1 795; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 796; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 797; GFX940-NEXT: buffer_inv sc1 798; GFX940-NEXT: s_setpc_b64 s[30:31] 799; 800; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 801; GFX11: ; %bb.0: 802; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 803; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 804; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 805; GFX11-NEXT: s_waitcnt lgkmcnt(0) 806; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 807; GFX11-NEXT: buffer_gl1_inv 808; GFX11-NEXT: buffer_gl0_inv 809; GFX11-NEXT: s_setpc_b64 s[30:31] 810; 811; GFX10-LABEL: 
flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 812; GFX10: ; %bb.0: 813; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 814; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 815; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 816; GFX10-NEXT: s_mov_b32 s4, 0 817; GFX10-NEXT: flat_load_dword v4, v[0:1] 818; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start 819; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 820; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 821; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 822; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 823; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 824; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 825; GFX10-NEXT: buffer_gl1_inv 826; GFX10-NEXT: buffer_gl0_inv 827; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 828; GFX10-NEXT: v_mov_b32_e32 v4, v3 829; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 830; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 831; GFX10-NEXT: s_cbranch_execnz .LBB4_1 832; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 833; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 834; GFX10-NEXT: s_setpc_b64 s[30:31] 835; 836; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 837; GFX90A: ; %bb.0: 838; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 839; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 840; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 841; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 842; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 843; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 844; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 845; GFX90A-NEXT: s_cbranch_execnz .LBB4_3 846; GFX90A-NEXT: ; %bb.1: ; %Flow2 847; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 848; GFX90A-NEXT: s_cbranch_execnz .LBB4_8 849; GFX90A-NEXT: .LBB4_2: ; %atomicrmw.phi 850; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 851; GFX90A-NEXT: s_waitcnt vmcnt(0) 852; GFX90A-NEXT: s_setpc_b64 s[30:31] 853; 
GFX90A-NEXT: .LBB4_3: ; %atomicrmw.check.private 854; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 855; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 856; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 857; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 858; GFX90A-NEXT: s_cbranch_execz .LBB4_5 859; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 860; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 861; GFX90A-NEXT: s_waitcnt vmcnt(0) 862; GFX90A-NEXT: buffer_wbinvl1 863; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 864; GFX90A-NEXT: ; implicit-def: $vgpr2 865; GFX90A-NEXT: .LBB4_5: ; %Flow 866; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 867; GFX90A-NEXT: s_cbranch_execz .LBB4_7 868; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 869; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 870; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 871; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 872; GFX90A-NEXT: s_waitcnt vmcnt(0) 873; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 874; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 875; GFX90A-NEXT: .LBB4_7: ; %Flow1 876; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 877; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 878; GFX90A-NEXT: ; implicit-def: $vgpr2 879; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 880; GFX90A-NEXT: s_cbranch_execz .LBB4_2 881; GFX90A-NEXT: .LBB4_8: ; %atomicrmw.shared 882; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 883; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 884; GFX90A-NEXT: ds_add_f32 v0, v2 885; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 886; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 887; GFX90A-NEXT: s_waitcnt vmcnt(0) 888; GFX90A-NEXT: s_setpc_b64 s[30:31] 889; 890; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 891; GFX908: ; %bb.0: 892; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 893; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 894; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 895; GFX908-NEXT: s_mov_b64 
s[4:5], src_shared_base 896; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 897; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 898; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 899; GFX908-NEXT: s_cbranch_execnz .LBB4_3 900; GFX908-NEXT: ; %bb.1: ; %Flow2 901; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 902; GFX908-NEXT: s_cbranch_execnz .LBB4_8 903; GFX908-NEXT: .LBB4_2: ; %atomicrmw.phi 904; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 905; GFX908-NEXT: s_waitcnt vmcnt(0) 906; GFX908-NEXT: s_setpc_b64 s[30:31] 907; GFX908-NEXT: .LBB4_3: ; %atomicrmw.check.private 908; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 909; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 910; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 911; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 912; GFX908-NEXT: s_cbranch_execz .LBB4_5 913; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 914; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 915; GFX908-NEXT: s_waitcnt vmcnt(0) 916; GFX908-NEXT: buffer_wbinvl1 917; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 918; GFX908-NEXT: ; implicit-def: $vgpr2 919; GFX908-NEXT: .LBB4_5: ; %Flow 920; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 921; GFX908-NEXT: s_cbranch_execz .LBB4_7 922; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 923; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 924; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 925; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 926; GFX908-NEXT: s_waitcnt vmcnt(0) 927; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 928; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 929; GFX908-NEXT: .LBB4_7: ; %Flow1 930; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 931; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 932; GFX908-NEXT: ; implicit-def: $vgpr2 933; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 934; GFX908-NEXT: s_cbranch_execz .LBB4_2 935; GFX908-NEXT: .LBB4_8: ; %atomicrmw.shared 936; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 937; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 938; GFX908-NEXT: ds_add_f32 v0, v2 
939; GFX908-NEXT: s_waitcnt lgkmcnt(0) 940; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 941; GFX908-NEXT: s_waitcnt vmcnt(0) 942; GFX908-NEXT: s_setpc_b64 s[30:31] 943; 944; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 945; GFX8: ; %bb.0: 946; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 947; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 948; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 949; GFX8-NEXT: flat_load_dword v4, v[0:1] 950; GFX8-NEXT: s_mov_b64 s[4:5], 0 951; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start 952; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 953; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 954; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 955; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 956; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 957; GFX8-NEXT: buffer_wbinvl1 958; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 959; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 960; GFX8-NEXT: v_mov_b32_e32 v4, v3 961; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 962; GFX8-NEXT: s_cbranch_execnz .LBB4_1 963; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 964; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 965; GFX8-NEXT: s_setpc_b64 s[30:31] 966; 967; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 968; GFX7: ; %bb.0: 969; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 970; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 971; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 972; GFX7-NEXT: flat_load_dword v4, v[0:1] 973; GFX7-NEXT: s_mov_b64 s[4:5], 0 974; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start 975; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 976; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 977; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 978; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 979; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 980; GFX7-NEXT: buffer_wbinvl1 981; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 982; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 
; Test @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode
; (defined on the following line): same agent-scope, seq_cst, unused-result atomicrmw fadd,
; but %ptr is moved back by 512 floats (-512 * 4 = -2048 bytes). In the checks only gfx12 folds
; this as offset:-2048; every other target adds 0xfffff800 (with -1 in the high half) to the
; 64-bit address before the atomic sequence.
983; GFX7-NEXT: v_mov_b32_e32 v4, v3 984; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 985; GFX7-NEXT: s_cbranch_execnz .LBB4_1 986; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 987; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 988; GFX7-NEXT: s_setpc_b64 s[30:31] 989 %gep = getelementptr float, ptr %ptr, i64 511 990 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 991 ret void 992} 993 994define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 995; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 996; GFX12: ; %bb.0: 997; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 998; GFX12-NEXT: s_wait_expcnt 0x0 999; GFX12-NEXT: s_wait_samplecnt 0x0 1000; GFX12-NEXT: s_wait_bvhcnt 0x0 1001; GFX12-NEXT: s_wait_kmcnt 0x0 1002; GFX12-NEXT: s_wait_storecnt 0x0 1003; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV 1004; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1005; GFX12-NEXT: global_inv scope:SCOPE_DEV 1006; GFX12-NEXT: s_setpc_b64 s[30:31] 1007; 1008; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1009; GFX940: ; %bb.0: 1010; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1011; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 1012; GFX940-NEXT: s_nop 1 1013; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 1014; GFX940-NEXT: buffer_wbl2 sc1 1015; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 1016; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1017; GFX940-NEXT: buffer_inv sc1 1018; GFX940-NEXT: s_setpc_b64 s[30:31] 1019; 1020; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1021; GFX11: ; %bb.0: 1022; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1023; GFX11-NEXT: 
v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 1024; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 1025; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1026; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 1027; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1029; GFX11-NEXT: buffer_gl1_inv 1030; GFX11-NEXT: buffer_gl0_inv 1031; GFX11-NEXT: s_setpc_b64 s[30:31] 1032; 1033; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1034; GFX10: ; %bb.0: 1035; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1036; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 1037; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 1038; GFX10-NEXT: s_mov_b32 s4, 0 1039; GFX10-NEXT: flat_load_dword v4, v[0:1] 1040; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start 1041; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1042; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1043; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 1044; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1045; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1046; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1047; GFX10-NEXT: buffer_gl1_inv 1048; GFX10-NEXT: buffer_gl0_inv 1049; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1050; GFX10-NEXT: v_mov_b32_e32 v4, v3 1051; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1052; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1053; GFX10-NEXT: s_cbranch_execnz .LBB5_1 1054; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1055; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1056; GFX10-NEXT: s_setpc_b64 s[30:31] 1057; 1058; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1059; GFX90A: ; %bb.0: 1060; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1061; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 1062; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 1063; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 1064; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 
s5, v1 1065; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 1066; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 1067; GFX90A-NEXT: s_cbranch_execnz .LBB5_3 1068; GFX90A-NEXT: ; %bb.1: ; %Flow2 1069; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1070; GFX90A-NEXT: s_cbranch_execnz .LBB5_8 1071; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.phi 1072; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1073; GFX90A-NEXT: s_waitcnt vmcnt(0) 1074; GFX90A-NEXT: s_setpc_b64 s[30:31] 1075; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private 1076; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 1077; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 1078; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 1079; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 1080; GFX90A-NEXT: s_cbranch_execz .LBB5_5 1081; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 1082; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 1083; GFX90A-NEXT: s_waitcnt vmcnt(0) 1084; GFX90A-NEXT: buffer_wbinvl1 1085; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 1086; GFX90A-NEXT: ; implicit-def: $vgpr2 1087; GFX90A-NEXT: .LBB5_5: ; %Flow 1088; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 1089; GFX90A-NEXT: s_cbranch_execz .LBB5_7 1090; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 1091; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1092; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1093; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 1094; GFX90A-NEXT: s_waitcnt vmcnt(0) 1095; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 1096; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 1097; GFX90A-NEXT: .LBB5_7: ; %Flow1 1098; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 1099; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 1100; GFX90A-NEXT: ; implicit-def: $vgpr2 1101; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1102; GFX90A-NEXT: s_cbranch_execz .LBB5_2 1103; GFX90A-NEXT: .LBB5_8: ; %atomicrmw.shared 1104; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1105; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1106; GFX90A-NEXT: ds_add_f32 v0, v2 1107; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) 1108; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1109; GFX90A-NEXT: s_waitcnt vmcnt(0) 1110; GFX90A-NEXT: s_setpc_b64 s[30:31] 1111; 1112; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1113; GFX908: ; %bb.0: 1114; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1115; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 1116; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 1117; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 1118; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 1119; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 1120; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 1121; GFX908-NEXT: s_cbranch_execnz .LBB5_3 1122; GFX908-NEXT: ; %bb.1: ; %Flow2 1123; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1124; GFX908-NEXT: s_cbranch_execnz .LBB5_8 1125; GFX908-NEXT: .LBB5_2: ; %atomicrmw.phi 1126; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1127; GFX908-NEXT: s_waitcnt vmcnt(0) 1128; GFX908-NEXT: s_setpc_b64 s[30:31] 1129; GFX908-NEXT: .LBB5_3: ; %atomicrmw.check.private 1130; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 1131; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 1132; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 1133; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 1134; GFX908-NEXT: s_cbranch_execz .LBB5_5 1135; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 1136; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 1137; GFX908-NEXT: s_waitcnt vmcnt(0) 1138; GFX908-NEXT: buffer_wbinvl1 1139; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 1140; GFX908-NEXT: ; implicit-def: $vgpr2 1141; GFX908-NEXT: .LBB5_5: ; %Flow 1142; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 1143; GFX908-NEXT: s_cbranch_execz .LBB5_7 1144; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 1145; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1146; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1147; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 1148; GFX908-NEXT: s_waitcnt vmcnt(0) 1149; 
GFX908-NEXT: v_add_f32_e32 v1, v1, v2 1150; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 1151; GFX908-NEXT: .LBB5_7: ; %Flow1 1152; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 1153; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 1154; GFX908-NEXT: ; implicit-def: $vgpr2 1155; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1156; GFX908-NEXT: s_cbranch_execz .LBB5_2 1157; GFX908-NEXT: .LBB5_8: ; %atomicrmw.shared 1158; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1159; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1160; GFX908-NEXT: ds_add_f32 v0, v2 1161; GFX908-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1163; GFX908-NEXT: s_waitcnt vmcnt(0) 1164; GFX908-NEXT: s_setpc_b64 s[30:31] 1165; 1166; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1167; GFX8: ; %bb.0: 1168; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1169; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 1170; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 1171; GFX8-NEXT: flat_load_dword v4, v[0:1] 1172; GFX8-NEXT: s_mov_b64 s[4:5], 0 1173; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start 1174; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1175; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1176; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 1177; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1178; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1179; GFX8-NEXT: buffer_wbinvl1 1180; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1181; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1182; GFX8-NEXT: v_mov_b32_e32 v4, v3 1183; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1184; GFX8-NEXT: s_cbranch_execnz .LBB5_1 1185; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1186; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1187; GFX8-NEXT: s_setpc_b64 s[30:31] 1188; 1189; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1190; GFX7: ; %bb.0: 1191; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
; Test @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode
; (defined on the following line): seq_cst atomicrmw fadd with no syncscope operand (the
; "system" case in the test name), through %ptr advanced by 511 floats (2044 = 0x7fc bytes).
; Unlike the noret tests, the value produced by the atomicrmw is returned from the function.
; Same two metadata attachments (!amdgpu.no.fine.grained.memory, !amdgpu.ignore.denormal.mode).
lgkmcnt(0) 1192; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 1193; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 1194; GFX7-NEXT: flat_load_dword v4, v[0:1] 1195; GFX7-NEXT: s_mov_b64 s[4:5], 0 1196; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start 1197; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1198; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1199; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 1200; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1201; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1202; GFX7-NEXT: buffer_wbinvl1 1203; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1204; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1205; GFX7-NEXT: v_mov_b32_e32 v4, v3 1206; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1207; GFX7-NEXT: s_cbranch_execnz .LBB5_1 1208; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1209; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1210; GFX7-NEXT: s_setpc_b64 s[30:31] 1211 %gep = getelementptr float, ptr %ptr, i64 -512 1212 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 1213 ret void 1214} 1215 1216define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 1217; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1218; GFX12: ; %bb.0: 1219; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1220; GFX12-NEXT: s_wait_expcnt 0x0 1221; GFX12-NEXT: s_wait_samplecnt 0x0 1222; GFX12-NEXT: s_wait_bvhcnt 0x0 1223; GFX12-NEXT: s_wait_kmcnt 0x0 1224; GFX12-NEXT: global_wb scope:SCOPE_SYS 1225; GFX12-NEXT: s_wait_storecnt 0x0 1226; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1227; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1228; GFX12-NEXT: global_inv scope:SCOPE_SYS 1229; GFX12-NEXT: s_setpc_b64 s[30:31] 1230; 1231; GFX940-LABEL: 
flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1232; GFX940: ; %bb.0: 1233; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1234; GFX940-NEXT: buffer_wbl2 sc0 sc1 1235; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 1236; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1237; GFX940-NEXT: buffer_inv sc0 sc1 1238; GFX940-NEXT: s_setpc_b64 s[30:31] 1239; 1240; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1241; GFX11: ; %bb.0: 1242; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1243; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1244; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc 1245; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1246; GFX11-NEXT: buffer_gl1_inv 1247; GFX11-NEXT: buffer_gl0_inv 1248; GFX11-NEXT: s_setpc_b64 s[30:31] 1249; 1250; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1251; GFX10: ; %bb.0: 1252; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1253; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 1254; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 1255; GFX10-NEXT: s_mov_b32 s4, 0 1256; GFX10-NEXT: flat_load_dword v0, v[3:4] 1257; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start 1258; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1259; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1260; GFX10-NEXT: v_mov_b32_e32 v1, v0 1261; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 1262; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1263; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1264; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1265; GFX10-NEXT: buffer_gl1_inv 1266; GFX10-NEXT: buffer_gl0_inv 1267; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 1268; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1269; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1270; GFX10-NEXT: s_cbranch_execnz .LBB6_1 1271; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 
1272; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1273; GFX10-NEXT: s_setpc_b64 s[30:31] 1274; 1275; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1276; GFX90A: ; %bb.0: 1277; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1278; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 1279; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 1280; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 1281; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 1282; GFX90A-NEXT: ; implicit-def: $vgpr0 1283; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 1284; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 1285; GFX90A-NEXT: s_cbranch_execnz .LBB6_3 1286; GFX90A-NEXT: ; %bb.1: ; %Flow2 1287; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1288; GFX90A-NEXT: s_cbranch_execnz .LBB6_8 1289; GFX90A-NEXT: .LBB6_2: ; %atomicrmw.phi 1290; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1291; GFX90A-NEXT: s_waitcnt vmcnt(0) 1292; GFX90A-NEXT: s_setpc_b64 s[30:31] 1293; GFX90A-NEXT: .LBB6_3: ; %atomicrmw.check.private 1294; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 1295; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 1296; GFX90A-NEXT: ; implicit-def: $vgpr0 1297; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 1298; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 1299; GFX90A-NEXT: s_cbranch_execz .LBB6_5 1300; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 1301; GFX90A-NEXT: buffer_wbl2 1302; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc 1303; GFX90A-NEXT: s_waitcnt vmcnt(0) 1304; GFX90A-NEXT: buffer_invl2 1305; GFX90A-NEXT: buffer_wbinvl1 1306; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 1307; GFX90A-NEXT: ; implicit-def: $vgpr2 1308; GFX90A-NEXT: .LBB6_5: ; %Flow 1309; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 1310; GFX90A-NEXT: s_cbranch_execz .LBB6_7 1311; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 1312; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 1313; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc 1314; GFX90A-NEXT: 
buffer_load_dword v0, v1, s[0:3], 0 offen 1315; GFX90A-NEXT: s_waitcnt vmcnt(0) 1316; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 1317; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 1318; GFX90A-NEXT: .LBB6_7: ; %Flow1 1319; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 1320; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 1321; GFX90A-NEXT: ; implicit-def: $vgpr2 1322; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1323; GFX90A-NEXT: s_cbranch_execz .LBB6_2 1324; GFX90A-NEXT: .LBB6_8: ; %atomicrmw.shared 1325; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 1326; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc 1327; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 1328; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1330; GFX90A-NEXT: s_waitcnt vmcnt(0) 1331; GFX90A-NEXT: s_setpc_b64 s[30:31] 1332; 1333; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1334; GFX908: ; %bb.0: 1335; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1336; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 1337; GFX908-NEXT: s_mov_b64 s[4:5], 0 1338; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start 1339; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1340; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1341; GFX908-NEXT: v_mov_b32_e32 v4, v3 1342; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 1343; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1344; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1345; GFX908-NEXT: buffer_wbinvl1 1346; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1347; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1348; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1349; GFX908-NEXT: s_cbranch_execnz .LBB6_1 1350; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1351; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1352; GFX908-NEXT: v_mov_b32_e32 v0, v3 1353; GFX908-NEXT: s_setpc_b64 s[30:31] 1354; 1355; GFX8-LABEL: 
flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1356; GFX8: ; %bb.0: 1357; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1358; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 1359; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1360; GFX8-NEXT: flat_load_dword v0, v[3:4] 1361; GFX8-NEXT: s_mov_b64 s[4:5], 0 1362; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start 1363; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1364; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1365; GFX8-NEXT: v_mov_b32_e32 v1, v0 1366; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 1367; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1368; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1369; GFX8-NEXT: buffer_wbinvl1 1370; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 1371; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1372; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1373; GFX8-NEXT: s_cbranch_execnz .LBB6_1 1374; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1375; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1376; GFX8-NEXT: s_setpc_b64 s[30:31] 1377; 1378; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1379; GFX7: ; %bb.0: 1380; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1381; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 1382; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1383; GFX7-NEXT: flat_load_dword v0, v[3:4] 1384; GFX7-NEXT: s_mov_b64 s[4:5], 0 1385; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start 1386; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1387; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1388; GFX7-NEXT: v_mov_b32_e32 v1, v0 1389; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 1390; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1391; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1392; GFX7-NEXT: buffer_wbinvl1 1393; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 1394; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1395; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1396; GFX7-NEXT: s_cbranch_execnz .LBB6_1 1397; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end 1398; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1399; GFX7-NEXT: s_setpc_b64 s[30:31] 1400 %gep = getelementptr float, ptr %ptr, i64 511 1401 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 1402 ret float %result 1403} 1404 1405define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 1406; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1407; GFX12: ; %bb.0: 1408; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1409; GFX12-NEXT: s_wait_expcnt 0x0 1410; GFX12-NEXT: s_wait_samplecnt 0x0 1411; GFX12-NEXT: s_wait_bvhcnt 0x0 1412; GFX12-NEXT: s_wait_kmcnt 0x0 1413; GFX12-NEXT: global_wb scope:SCOPE_SYS 1414; GFX12-NEXT: s_wait_storecnt 0x0 1415; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS 1416; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1417; GFX12-NEXT: global_inv scope:SCOPE_SYS 1418; GFX12-NEXT: s_setpc_b64 s[30:31] 1419; 1420; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1421; GFX940: ; %bb.0: 1422; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1423; GFX940-NEXT: buffer_wbl2 sc0 sc1 1424; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 1425; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1426; GFX940-NEXT: buffer_inv sc0 sc1 1427; GFX940-NEXT: s_setpc_b64 s[30:31] 1428; 1429; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1430; GFX11: ; %bb.0: 1431; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1432; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1433; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 1434; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1435; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1436; GFX11-NEXT: buffer_gl1_inv 1437; 
GFX11-NEXT: buffer_gl0_inv 1438; GFX11-NEXT: s_setpc_b64 s[30:31] 1439; 1440; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1441; GFX10: ; %bb.0: 1442; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1443; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 1444; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1445; GFX10-NEXT: s_mov_b32 s4, 0 1446; GFX10-NEXT: flat_load_dword v4, v[0:1] 1447; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start 1448; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1449; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1450; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 1451; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1452; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1453; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1454; GFX10-NEXT: buffer_gl1_inv 1455; GFX10-NEXT: buffer_gl0_inv 1456; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1457; GFX10-NEXT: v_mov_b32_e32 v4, v3 1458; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1459; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1460; GFX10-NEXT: s_cbranch_execnz .LBB7_1 1461; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1462; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1463; GFX10-NEXT: s_setpc_b64 s[30:31] 1464; 1465; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1466; GFX90A: ; %bb.0: 1467; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1468; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 1469; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1470; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 1471; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 1472; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 1473; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 1474; GFX90A-NEXT: s_cbranch_execnz .LBB7_3 1475; GFX90A-NEXT: ; %bb.1: ; %Flow2 1476; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1477; GFX90A-NEXT: s_cbranch_execnz .LBB7_8 1478; GFX90A-NEXT: .LBB7_2: ; %atomicrmw.phi 
1479; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1480; GFX90A-NEXT: s_waitcnt vmcnt(0) 1481; GFX90A-NEXT: s_setpc_b64 s[30:31] 1482; GFX90A-NEXT: .LBB7_3: ; %atomicrmw.check.private 1483; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 1484; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 1485; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 1486; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 1487; GFX90A-NEXT: s_cbranch_execz .LBB7_5 1488; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 1489; GFX90A-NEXT: buffer_wbl2 1490; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 1491; GFX90A-NEXT: s_waitcnt vmcnt(0) 1492; GFX90A-NEXT: buffer_invl2 1493; GFX90A-NEXT: buffer_wbinvl1 1494; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 1495; GFX90A-NEXT: ; implicit-def: $vgpr2 1496; GFX90A-NEXT: .LBB7_5: ; %Flow 1497; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 1498; GFX90A-NEXT: s_cbranch_execz .LBB7_7 1499; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 1500; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1501; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1502; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 1503; GFX90A-NEXT: s_waitcnt vmcnt(0) 1504; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 1505; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 1506; GFX90A-NEXT: .LBB7_7: ; %Flow1 1507; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 1508; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 1509; GFX90A-NEXT: ; implicit-def: $vgpr2 1510; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1511; GFX90A-NEXT: s_cbranch_execz .LBB7_2 1512; GFX90A-NEXT: .LBB7_8: ; %atomicrmw.shared 1513; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1514; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1515; GFX90A-NEXT: ds_add_f32 v0, v2 1516; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1517; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1518; GFX90A-NEXT: s_waitcnt vmcnt(0) 1519; GFX90A-NEXT: s_setpc_b64 s[30:31] 1520; 1521; GFX908-LABEL: 
flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1522; GFX908: ; %bb.0: 1523; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1524; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 1525; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1526; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 1527; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 1528; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 1529; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 1530; GFX908-NEXT: s_cbranch_execnz .LBB7_3 1531; GFX908-NEXT: ; %bb.1: ; %Flow2 1532; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1533; GFX908-NEXT: s_cbranch_execnz .LBB7_8 1534; GFX908-NEXT: .LBB7_2: ; %atomicrmw.phi 1535; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1536; GFX908-NEXT: s_waitcnt vmcnt(0) 1537; GFX908-NEXT: s_setpc_b64 s[30:31] 1538; GFX908-NEXT: .LBB7_3: ; %atomicrmw.check.private 1539; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 1540; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 1541; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 1542; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 1543; GFX908-NEXT: s_cbranch_execz .LBB7_5 1544; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 1545; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 1546; GFX908-NEXT: s_waitcnt vmcnt(0) 1547; GFX908-NEXT: buffer_wbinvl1 1548; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 1549; GFX908-NEXT: ; implicit-def: $vgpr2 1550; GFX908-NEXT: .LBB7_5: ; %Flow 1551; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 1552; GFX908-NEXT: s_cbranch_execz .LBB7_7 1553; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 1554; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1555; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1556; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 1557; GFX908-NEXT: s_waitcnt vmcnt(0) 1558; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 1559; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 1560; GFX908-NEXT: .LBB7_7: ; %Flow1 1561; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 
1562; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 1563; GFX908-NEXT: ; implicit-def: $vgpr2 1564; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 1565; GFX908-NEXT: s_cbranch_execz .LBB7_2 1566; GFX908-NEXT: .LBB7_8: ; %atomicrmw.shared 1567; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 1568; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1569; GFX908-NEXT: ds_add_f32 v0, v2 1570; GFX908-NEXT: s_waitcnt lgkmcnt(0) 1571; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1572; GFX908-NEXT: s_waitcnt vmcnt(0) 1573; GFX908-NEXT: s_setpc_b64 s[30:31] 1574; 1575; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1576; GFX8: ; %bb.0: 1577; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1578; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 1579; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1580; GFX8-NEXT: flat_load_dword v4, v[0:1] 1581; GFX8-NEXT: s_mov_b64 s[4:5], 0 1582; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start 1583; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1584; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1585; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 1586; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1587; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1588; GFX8-NEXT: buffer_wbinvl1 1589; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1590; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1591; GFX8-NEXT: v_mov_b32_e32 v4, v3 1592; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1593; GFX8-NEXT: s_cbranch_execnz .LBB7_1 1594; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1595; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1596; GFX8-NEXT: s_setpc_b64 s[30:31] 1597; 1598; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1599; GFX7: ; %bb.0: 1600; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1601; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 1602; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1603; GFX7-NEXT: flat_load_dword v4, v[0:1] 1604; GFX7-NEXT: s_mov_b64 
s[4:5], 0 1605; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start 1606; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1607; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1608; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 1609; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1610; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1611; GFX7-NEXT: buffer_wbinvl1 1612; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1613; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1614; GFX7-NEXT: v_mov_b32_e32 v4, v3 1615; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1616; GFX7-NEXT: s_cbranch_execnz .LBB7_1 1617; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1618; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1619; GFX7-NEXT: s_setpc_b64 s[30:31] 1620 %gep = getelementptr float, ptr %ptr, i64 511 1621 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 1622 ret void 1623} 1624 1625define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) { 1626; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1627; GFX12: ; %bb.0: 1628; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1629; GFX12-NEXT: s_wait_expcnt 0x0 1630; GFX12-NEXT: s_wait_samplecnt 0x0 1631; GFX12-NEXT: s_wait_bvhcnt 0x0 1632; GFX12-NEXT: s_wait_kmcnt 0x0 1633; GFX12-NEXT: s_wait_storecnt 0x0 1634; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 1635; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1636; GFX12-NEXT: global_inv scope:SCOPE_DEV 1637; GFX12-NEXT: s_setpc_b64 s[30:31] 1638; 1639; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1640; GFX940: ; %bb.0: 1641; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1642; GFX940-NEXT: buffer_wbl2 sc1 1643; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 1644; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1645; GFX940-NEXT: buffer_inv sc1 1646; GFX940-NEXT: s_setpc_b64 s[30:31] 1647; 1648; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1649; GFX11: ; %bb.0: 1650; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 1651; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 1652; GFX11-NEXT: s_mov_b32 s0, 0 1653; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start 1654; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1655; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1656; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 1657; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1658; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 1659; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1660; GFX11-NEXT: buffer_gl1_inv 1661; GFX11-NEXT: buffer_gl0_inv 1662; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1663; GFX11-NEXT: v_mov_b32_e32 v4, v3 1664; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1665; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1666; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1667; GFX11-NEXT: s_cbranch_execnz .LBB8_1 1668; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1669; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1670; GFX11-NEXT: s_setpc_b64 s[30:31] 1671; 1672; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1673; GFX10: ; %bb.0: 1674; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1675; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 1676; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1677; GFX10-NEXT: s_mov_b32 s4, 0 1678; GFX10-NEXT: flat_load_dword v4, v[0:1] 1679; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start 1680; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1681; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1682; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 1683; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1684; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1685; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1686; GFX10-NEXT: buffer_gl1_inv 1687; GFX10-NEXT: buffer_gl0_inv 1688; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1689; GFX10-NEXT: v_mov_b32_e32 v4, v3 1690; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1691; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1692; GFX10-NEXT: s_cbranch_execnz .LBB8_1 1693; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1694; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 1695; GFX10-NEXT: s_setpc_b64 s[30:31] 1696; 1697; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1698; GFX90A: ; %bb.0: 1699; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1700; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 1701; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1702; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start 1703; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1704; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1705; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 1706; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 1707; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1708; GFX90A-NEXT: buffer_wbinvl1 1709; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1710; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1711; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1712; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1713; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 1714; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1715; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1716; GFX90A-NEXT: s_setpc_b64 s[30:31] 1717; 1718; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1719; GFX908: ; %bb.0: 1720; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1721; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 1722; GFX908-NEXT: s_mov_b64 s[4:5], 0 1723; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start 1724; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1725; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1726; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 1727; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1728; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1729; GFX908-NEXT: buffer_wbinvl1 1730; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1731; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1732; GFX908-NEXT: v_mov_b32_e32 v4, v3 1733; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1734; GFX908-NEXT: s_cbranch_execnz .LBB8_1 1735; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1736; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1737; GFX908-NEXT: s_setpc_b64 
s[30:31] 1738; 1739; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1740; GFX8: ; %bb.0: 1741; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1742; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 1743; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1744; GFX8-NEXT: flat_load_dword v4, v[0:1] 1745; GFX8-NEXT: s_mov_b64 s[4:5], 0 1746; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start 1747; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1748; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1749; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 1750; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1751; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1752; GFX8-NEXT: buffer_wbinvl1 1753; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1754; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1755; GFX8-NEXT: v_mov_b32_e32 v4, v3 1756; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1757; GFX8-NEXT: s_cbranch_execnz .LBB8_1 1758; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1759; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1760; GFX8-NEXT: s_setpc_b64 s[30:31] 1761; 1762; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: 1763; GFX7: ; %bb.0: 1764; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1765; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 1766; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1767; GFX7-NEXT: flat_load_dword v4, v[0:1] 1768; GFX7-NEXT: s_mov_b64 s[4:5], 0 1769; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start 1770; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1771; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1772; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 1773; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1774; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1775; GFX7-NEXT: buffer_wbinvl1 1776; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1777; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1778; GFX7-NEXT: v_mov_b32_e32 v4, v3 1779; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1780; GFX7-NEXT: s_cbranch_execnz .LBB8_1 1781; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1782; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1783; GFX7-NEXT: 
s_setpc_b64 s[30:31] 1784 %gep = getelementptr float, ptr %ptr, i64 511 1785 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst 1786 ret void 1787} 1788 1789define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 { 1790; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1791; GFX12: ; %bb.0: 1792; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1793; GFX12-NEXT: s_wait_expcnt 0x0 1794; GFX12-NEXT: s_wait_samplecnt 0x0 1795; GFX12-NEXT: s_wait_bvhcnt 0x0 1796; GFX12-NEXT: s_wait_kmcnt 0x0 1797; GFX12-NEXT: s_wait_storecnt 0x0 1798; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 1799; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1800; GFX12-NEXT: global_inv scope:SCOPE_DEV 1801; GFX12-NEXT: s_setpc_b64 s[30:31] 1802; 1803; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1804; GFX940: ; %bb.0: 1805; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1806; GFX940-NEXT: buffer_wbl2 sc1 1807; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 1808; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1809; GFX940-NEXT: buffer_inv sc1 1810; GFX940-NEXT: s_setpc_b64 s[30:31] 1811; 1812; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1813; GFX11: ; %bb.0: 1814; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1815; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1816; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 1817; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1818; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1819; GFX11-NEXT: buffer_gl1_inv 1820; GFX11-NEXT: buffer_gl0_inv 1821; GFX11-NEXT: s_setpc_b64 s[30:31] 1822; 1823; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1824; GFX10: ; %bb.0: 1825; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1826; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 1827; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1828; GFX10-NEXT: s_mov_b32 
s4, 0 1829; GFX10-NEXT: flat_load_dword v4, v[0:1] 1830; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start 1831; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1832; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1833; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 1834; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1835; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1836; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1837; GFX10-NEXT: buffer_gl1_inv 1838; GFX10-NEXT: buffer_gl0_inv 1839; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1840; GFX10-NEXT: v_mov_b32_e32 v4, v3 1841; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1842; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1843; GFX10-NEXT: s_cbranch_execnz .LBB9_1 1844; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1845; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1846; GFX10-NEXT: s_setpc_b64 s[30:31] 1847; 1848; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1849; GFX90A: ; %bb.0: 1850; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1851; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 1852; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1853; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start 1854; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1855; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1856; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 1857; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 1858; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1859; GFX90A-NEXT: buffer_wbinvl1 1860; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1861; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1862; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1863; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1864; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 1865; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1866; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1867; GFX90A-NEXT: s_setpc_b64 s[30:31] 1868; 1869; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1870; GFX908: ; %bb.0: 1871; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1872; GFX908-NEXT: 
flat_load_dword v4, v[0:1] offset:2044 1873; GFX908-NEXT: s_mov_b64 s[4:5], 0 1874; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start 1875; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1876; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1877; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 1878; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1879; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1880; GFX908-NEXT: buffer_wbinvl1 1881; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1882; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1883; GFX908-NEXT: v_mov_b32_e32 v4, v3 1884; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1885; GFX908-NEXT: s_cbranch_execnz .LBB9_1 1886; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1887; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1888; GFX908-NEXT: s_setpc_b64 s[30:31] 1889; 1890; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1891; GFX8: ; %bb.0: 1892; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1893; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 1894; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1895; GFX8-NEXT: flat_load_dword v4, v[0:1] 1896; GFX8-NEXT: s_mov_b64 s[4:5], 0 1897; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start 1898; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1899; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1900; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 1901; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1902; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1903; GFX8-NEXT: buffer_wbinvl1 1904; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1905; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1906; GFX8-NEXT: v_mov_b32_e32 v4, v3 1907; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1908; GFX8-NEXT: s_cbranch_execnz .LBB9_1 1909; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1910; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1911; GFX8-NEXT: s_setpc_b64 s[30:31] 1912; 1913; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: 1914; GFX7: ; %bb.0: 1915; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1916; GFX7-NEXT: 
v_add_i32_e32 v0, vcc, 0x7fc, v0 1917; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1918; GFX7-NEXT: flat_load_dword v4, v[0:1] 1919; GFX7-NEXT: s_mov_b64 s[4:5], 0 1920; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start 1921; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1922; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1923; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 1924; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1925; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1926; GFX7-NEXT: buffer_wbinvl1 1927; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1928; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1929; GFX7-NEXT: v_mov_b32_e32 v4, v3 1930; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1931; GFX7-NEXT: s_cbranch_execnz .LBB9_1 1932; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1933; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1934; GFX7-NEXT: s_setpc_b64 s[30:31] 1935 %gep = getelementptr float, ptr %ptr, i64 511 1936 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1937 ret void 1938} 1939 1940define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 1941; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1942; GFX12: ; %bb.0: 1943; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1944; GFX12-NEXT: s_wait_expcnt 0x0 1945; GFX12-NEXT: s_wait_samplecnt 0x0 1946; GFX12-NEXT: s_wait_bvhcnt 0x0 1947; GFX12-NEXT: s_wait_kmcnt 0x0 1948; GFX12-NEXT: s_wait_storecnt 0x0 1949; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 1950; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1951; GFX12-NEXT: global_inv scope:SCOPE_DEV 1952; GFX12-NEXT: s_setpc_b64 s[30:31] 1953; 1954; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1955; GFX940: ; %bb.0: 1956; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1957; GFX940-NEXT: buffer_wbl2 sc1 1958; GFX940-NEXT: 
flat_atomic_add_f32 v[0:1], v2 offset:2044 1959; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1960; GFX940-NEXT: buffer_inv sc1 1961; GFX940-NEXT: s_setpc_b64 s[30:31] 1962; 1963; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1964; GFX11: ; %bb.0: 1965; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1966; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1967; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 1968; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1969; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1970; GFX11-NEXT: buffer_gl1_inv 1971; GFX11-NEXT: buffer_gl0_inv 1972; GFX11-NEXT: s_setpc_b64 s[30:31] 1973; 1974; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 1975; GFX10: ; %bb.0: 1976; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1977; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 1978; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1979; GFX10-NEXT: s_mov_b32 s4, 0 1980; GFX10-NEXT: flat_load_dword v4, v[0:1] 1981; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start 1982; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1983; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1984; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 1985; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1986; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1987; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1988; GFX10-NEXT: buffer_gl1_inv 1989; GFX10-NEXT: buffer_gl0_inv 1990; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1991; GFX10-NEXT: v_mov_b32_e32 v4, v3 1992; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1993; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1994; GFX10-NEXT: s_cbranch_execnz .LBB10_1 1995; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1996; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1997; GFX10-NEXT: s_setpc_b64 s[30:31] 1998; 1999; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 2000; GFX90A: ; %bb.0: 2001; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 2002; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 2003; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2004; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 2005; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 2006; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 2007; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2008; GFX90A-NEXT: s_cbranch_execnz .LBB10_3 2009; GFX90A-NEXT: ; %bb.1: ; %Flow2 2010; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2011; GFX90A-NEXT: s_cbranch_execnz .LBB10_8 2012; GFX90A-NEXT: .LBB10_2: ; %atomicrmw.phi 2013; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2014; GFX90A-NEXT: s_waitcnt vmcnt(0) 2015; GFX90A-NEXT: s_setpc_b64 s[30:31] 2016; GFX90A-NEXT: .LBB10_3: ; %atomicrmw.check.private 2017; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 2018; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 2019; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 2020; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 2021; GFX90A-NEXT: s_cbranch_execz .LBB10_5 2022; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 2023; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 2024; GFX90A-NEXT: s_waitcnt vmcnt(0) 2025; GFX90A-NEXT: buffer_wbinvl1 2026; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2027; GFX90A-NEXT: ; implicit-def: $vgpr2 2028; GFX90A-NEXT: .LBB10_5: ; %Flow 2029; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 2030; GFX90A-NEXT: s_cbranch_execz .LBB10_7 2031; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 2032; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2033; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2034; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 2035; GFX90A-NEXT: s_waitcnt vmcnt(0) 2036; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 2037; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 2038; GFX90A-NEXT: .LBB10_7: ; %Flow1 2039; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 2040; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2041; GFX90A-NEXT: ; implicit-def: $vgpr2 2042; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2043; 
GFX90A-NEXT: s_cbranch_execz .LBB10_2 2044; GFX90A-NEXT: .LBB10_8: ; %atomicrmw.shared 2045; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2046; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2047; GFX90A-NEXT: ds_add_f32 v0, v2 2048; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2049; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2050; GFX90A-NEXT: s_waitcnt vmcnt(0) 2051; GFX90A-NEXT: s_setpc_b64 s[30:31] 2052; 2053; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 2054; GFX908: ; %bb.0: 2055; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2056; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 2057; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2058; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 2059; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 2060; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 2061; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2062; GFX908-NEXT: s_cbranch_execnz .LBB10_3 2063; GFX908-NEXT: ; %bb.1: ; %Flow2 2064; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2065; GFX908-NEXT: s_cbranch_execnz .LBB10_8 2066; GFX908-NEXT: .LBB10_2: ; %atomicrmw.phi 2067; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2068; GFX908-NEXT: s_waitcnt vmcnt(0) 2069; GFX908-NEXT: s_setpc_b64 s[30:31] 2070; GFX908-NEXT: .LBB10_3: ; %atomicrmw.check.private 2071; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 2072; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 2073; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 2074; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 2075; GFX908-NEXT: s_cbranch_execz .LBB10_5 2076; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 2077; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 2078; GFX908-NEXT: s_waitcnt vmcnt(0) 2079; GFX908-NEXT: buffer_wbinvl1 2080; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 2081; GFX908-NEXT: ; implicit-def: $vgpr2 2082; GFX908-NEXT: .LBB10_5: ; %Flow 2083; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 2084; GFX908-NEXT: s_cbranch_execz .LBB10_7 2085; GFX908-NEXT: ; %bb.6: ; 
%atomicrmw.private 2086; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2087; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2088; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 2089; GFX908-NEXT: s_waitcnt vmcnt(0) 2090; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 2091; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 2092; GFX908-NEXT: .LBB10_7: ; %Flow1 2093; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 2094; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 2095; GFX908-NEXT: ; implicit-def: $vgpr2 2096; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2097; GFX908-NEXT: s_cbranch_execz .LBB10_2 2098; GFX908-NEXT: .LBB10_8: ; %atomicrmw.shared 2099; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2100; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2101; GFX908-NEXT: ds_add_f32 v0, v2 2102; GFX908-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2104; GFX908-NEXT: s_waitcnt vmcnt(0) 2105; GFX908-NEXT: s_setpc_b64 s[30:31] 2106; 2107; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 2108; GFX8: ; %bb.0: 2109; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2110; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 2111; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2112; GFX8-NEXT: flat_load_dword v4, v[0:1] 2113; GFX8-NEXT: s_mov_b64 s[4:5], 0 2114; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start 2115; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2116; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2117; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 2118; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2119; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2120; GFX8-NEXT: buffer_wbinvl1 2121; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2122; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2123; GFX8-NEXT: v_mov_b32_e32 v4, v3 2124; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2125; GFX8-NEXT: s_cbranch_execnz .LBB10_1 2126; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2127; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2128; GFX8-NEXT: 
s_setpc_b64 s[30:31] 2129; 2130; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 2131; GFX7: ; %bb.0: 2132; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2133; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 2134; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2135; GFX7-NEXT: flat_load_dword v4, v[0:1] 2136; GFX7-NEXT: s_mov_b64 s[4:5], 0 2137; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start 2138; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2139; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2140; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 2141; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2142; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2143; GFX7-NEXT: buffer_wbinvl1 2144; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2145; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2146; GFX7-NEXT: v_mov_b32_e32 v4, v3 2147; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2148; GFX7-NEXT: s_cbranch_execnz .LBB10_1 2149; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2150; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2151; GFX7-NEXT: s_setpc_b64 s[30:31] 2152 %gep = getelementptr float, ptr %ptr, i64 511 2153 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 2154 ret void 2155} 2156 2157define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 2158; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2159; GFX12: ; %bb.0: 2160; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2161; GFX12-NEXT: s_wait_expcnt 0x0 2162; GFX12-NEXT: s_wait_samplecnt 0x0 2163; GFX12-NEXT: s_wait_bvhcnt 0x0 2164; GFX12-NEXT: s_wait_kmcnt 0x0 2165; GFX12-NEXT: s_wait_storecnt 0x0 2166; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 2167; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 2168; GFX12-NEXT: global_inv scope:SCOPE_DEV 2169; GFX12-NEXT: s_setpc_b64 s[30:31] 2170; 2171; GFX940-LABEL: 
flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2172; GFX940: ; %bb.0: 2173; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2174; GFX940-NEXT: buffer_wbl2 sc1 2175; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 2176; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2177; GFX940-NEXT: buffer_inv sc1 2178; GFX940-NEXT: s_setpc_b64 s[30:31] 2179; 2180; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2181; GFX11: ; %bb.0: 2182; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2183; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 2184; GFX11-NEXT: s_mov_b32 s0, 0 2185; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start 2186; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2187; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2188; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 2189; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2190; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 2191; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2192; GFX11-NEXT: buffer_gl1_inv 2193; GFX11-NEXT: buffer_gl0_inv 2194; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2195; GFX11-NEXT: v_mov_b32_e32 v4, v3 2196; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 2197; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2198; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2199; GFX11-NEXT: s_cbranch_execnz .LBB11_1 2200; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2201; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2202; GFX11-NEXT: s_setpc_b64 s[30:31] 2203; 2204; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2205; GFX10: ; %bb.0: 2206; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2207; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 2208; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2209; GFX10-NEXT: s_mov_b32 s4, 0 2210; GFX10-NEXT: flat_load_dword v4, v[0:1] 2211; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start 2212; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2213; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2214; GFX10-NEXT: 
v_add_f32_e32 v3, v4, v2 2215; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2216; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2217; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2218; GFX10-NEXT: buffer_gl1_inv 2219; GFX10-NEXT: buffer_gl0_inv 2220; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2221; GFX10-NEXT: v_mov_b32_e32 v4, v3 2222; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2223; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2224; GFX10-NEXT: s_cbranch_execnz .LBB11_1 2225; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2226; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2227; GFX10-NEXT: s_setpc_b64 s[30:31] 2228; 2229; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2230; GFX90A: ; %bb.0: 2231; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2232; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 2233; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2234; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start 2235; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2236; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2237; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 2238; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 2239; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2240; GFX90A-NEXT: buffer_wbinvl1 2241; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2242; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2243; GFX90A-NEXT: v_mov_b32_e32 v5, v3 2244; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2245; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 2246; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2247; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2248; GFX90A-NEXT: s_setpc_b64 s[30:31] 2249; 2250; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2251; GFX908: ; %bb.0: 2252; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2253; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 2254; GFX908-NEXT: s_mov_b64 s[4:5], 0 2255; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start 2256; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2257; GFX908-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 2258; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 2259; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 2260; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2261; GFX908-NEXT: buffer_wbinvl1 2262; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2263; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2264; GFX908-NEXT: v_mov_b32_e32 v4, v3 2265; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2266; GFX908-NEXT: s_cbranch_execnz .LBB11_1 2267; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2268; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2269; GFX908-NEXT: s_setpc_b64 s[30:31] 2270; 2271; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2272; GFX8: ; %bb.0: 2273; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2274; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 2275; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2276; GFX8-NEXT: flat_load_dword v4, v[0:1] 2277; GFX8-NEXT: s_mov_b64 s[4:5], 0 2278; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start 2279; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2280; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2281; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 2282; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2283; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2284; GFX8-NEXT: buffer_wbinvl1 2285; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2286; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2287; GFX8-NEXT: v_mov_b32_e32 v4, v3 2288; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2289; GFX8-NEXT: s_cbranch_execnz .LBB11_1 2290; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2291; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2292; GFX8-NEXT: s_setpc_b64 s[30:31] 2293; 2294; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: 2295; GFX7: ; %bb.0: 2296; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2297; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 2298; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2299; GFX7-NEXT: flat_load_dword v4, v[0:1] 2300; GFX7-NEXT: s_mov_b64 s[4:5], 0 2301; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start 2302; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2303; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2304; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 2305; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2306; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2307; GFX7-NEXT: buffer_wbinvl1 2308; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2309; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2310; GFX7-NEXT: v_mov_b32_e32 v4, v3 2311; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2312; GFX7-NEXT: s_cbranch_execnz .LBB11_1 2313; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2314; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2315; GFX7-NEXT: s_setpc_b64 s[30:31] 2316 %gep = getelementptr float, ptr %ptr, i64 511 2317 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 2318 ret void 2319} 2320 2321; -------------------------------------------------------------------- 2322; float with ftz/daz 2323; -------------------------------------------------------------------- 2324 2325define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2326; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2327; GFX12: ; %bb.0: 2328; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2329; GFX12-NEXT: s_wait_expcnt 0x0 2330; GFX12-NEXT: s_wait_samplecnt 0x0 2331; GFX12-NEXT: s_wait_bvhcnt 0x0 2332; GFX12-NEXT: s_wait_kmcnt 0x0 2333; GFX12-NEXT: s_wait_storecnt 0x0 2334; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2335; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2336; GFX12-NEXT: global_inv scope:SCOPE_DEV 2337; GFX12-NEXT: s_setpc_b64 s[30:31] 2338; 2339; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2340; GFX940: ; %bb.0: 2341; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2342; GFX940-NEXT: buffer_wbl2 sc1 2343; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 2344; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2345; GFX940-NEXT: 
buffer_inv sc1 2346; GFX940-NEXT: s_setpc_b64 s[30:31] 2347; 2348; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2349; GFX11: ; %bb.0: 2350; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2351; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2352; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc 2353; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2354; GFX11-NEXT: buffer_gl1_inv 2355; GFX11-NEXT: buffer_gl0_inv 2356; GFX11-NEXT: s_setpc_b64 s[30:31] 2357; 2358; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2359; GFX10: ; %bb.0: 2360; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2361; GFX10-NEXT: flat_load_dword v3, v[0:1] 2362; GFX10-NEXT: s_mov_b32 s4, 0 2363; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start 2364; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2365; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2366; GFX10-NEXT: v_mov_b32_e32 v4, v3 2367; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 2368; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2369; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2370; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2371; GFX10-NEXT: buffer_gl1_inv 2372; GFX10-NEXT: buffer_gl0_inv 2373; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2374; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2375; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2376; GFX10-NEXT: s_cbranch_execnz .LBB12_1 2377; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2378; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2379; GFX10-NEXT: v_mov_b32_e32 v0, v3 2380; GFX10-NEXT: s_setpc_b64 s[30:31] 2381; 2382; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2383; GFX90A: ; %bb.0: 2384; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2385; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 2386; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 2387; GFX90A-NEXT: ; implicit-def: $vgpr3 2388; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 2389; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2390; GFX90A-NEXT: s_cbranch_execz 
.LBB12_6 2391; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private 2392; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 2393; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 2394; GFX90A-NEXT: ; implicit-def: $vgpr3 2395; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 2396; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 2397; GFX90A-NEXT: s_cbranch_execz .LBB12_3 2398; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global 2399; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc 2400; GFX90A-NEXT: s_waitcnt vmcnt(0) 2401; GFX90A-NEXT: buffer_wbinvl1 2402; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2403; GFX90A-NEXT: ; implicit-def: $vgpr2 2404; GFX90A-NEXT: .LBB12_3: ; %Flow 2405; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 2406; GFX90A-NEXT: s_cbranch_execz .LBB12_5 2407; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private 2408; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2409; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2410; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen 2411; GFX90A-NEXT: s_waitcnt vmcnt(0) 2412; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 2413; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 2414; GFX90A-NEXT: .LBB12_5: ; %Flow1 2415; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 2416; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2417; GFX90A-NEXT: ; implicit-def: $vgpr2 2418; GFX90A-NEXT: .LBB12_6: ; %Flow2 2419; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2420; GFX90A-NEXT: s_cbranch_execz .LBB12_8 2421; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared 2422; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2423; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2424; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 2425; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2426; GFX90A-NEXT: .LBB12_8: ; %atomicrmw.phi 2427; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2428; GFX90A-NEXT: v_mov_b32_e32 v0, v3 2429; GFX90A-NEXT: s_waitcnt vmcnt(0) 2430; GFX90A-NEXT: s_setpc_b64 s[30:31] 2431; 2432; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2433; GFX908: ; %bb.0: 
2434; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2435; GFX908-NEXT: flat_load_dword v3, v[0:1] 2436; GFX908-NEXT: s_mov_b64 s[4:5], 0 2437; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start 2438; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2439; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2440; GFX908-NEXT: v_mov_b32_e32 v4, v3 2441; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 2442; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2443; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2444; GFX908-NEXT: buffer_wbinvl1 2445; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2446; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2447; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2448; GFX908-NEXT: s_cbranch_execnz .LBB12_1 2449; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2450; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2451; GFX908-NEXT: v_mov_b32_e32 v0, v3 2452; GFX908-NEXT: s_setpc_b64 s[30:31] 2453; 2454; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2455; GFX8: ; %bb.0: 2456; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2457; GFX8-NEXT: flat_load_dword v3, v[0:1] 2458; GFX8-NEXT: s_mov_b64 s[4:5], 0 2459; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start 2460; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2461; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2462; GFX8-NEXT: v_mov_b32_e32 v4, v3 2463; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 2464; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2465; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2466; GFX8-NEXT: buffer_wbinvl1 2467; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2468; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2469; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2470; GFX8-NEXT: s_cbranch_execnz .LBB12_1 2471; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2472; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2473; GFX8-NEXT: v_mov_b32_e32 v0, v3 2474; GFX8-NEXT: s_setpc_b64 s[30:31] 2475; 2476; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: 2477; GFX7: ; %bb.0: 2478; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 2479; GFX7-NEXT: flat_load_dword v3, v[0:1] 2480; GFX7-NEXT: s_mov_b64 s[4:5], 0 2481; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start 2482; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2483; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2484; GFX7-NEXT: v_mov_b32_e32 v4, v3 2485; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 2486; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2487; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2488; GFX7-NEXT: buffer_wbinvl1 2489; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2490; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2491; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2492; GFX7-NEXT: s_cbranch_execnz .LBB12_1 2493; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2494; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2495; GFX7-NEXT: v_mov_b32_e32 v0, v3 2496; GFX7-NEXT: s_setpc_b64 s[30:31] 2497 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2498 ret float %result 2499} 2500 2501define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2502; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2503; GFX12: ; %bb.0: 2504; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2505; GFX12-NEXT: s_wait_expcnt 0x0 2506; GFX12-NEXT: s_wait_samplecnt 0x0 2507; GFX12-NEXT: s_wait_bvhcnt 0x0 2508; GFX12-NEXT: s_wait_kmcnt 0x0 2509; GFX12-NEXT: s_wait_storecnt 0x0 2510; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2511; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2512; GFX12-NEXT: global_inv scope:SCOPE_DEV 2513; GFX12-NEXT: s_setpc_b64 s[30:31] 2514; 2515; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2516; GFX940: ; %bb.0: 2517; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2518; GFX940-NEXT: buffer_wbl2 sc1 2519; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 2520; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2521; GFX940-NEXT: buffer_inv sc1 2522; GFX940-NEXT: s_setpc_b64 s[30:31] 2523; 2524; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2525; GFX11: ; %bb.0: 2526; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2527; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2528; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc 2529; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2530; GFX11-NEXT: buffer_gl1_inv 2531; GFX11-NEXT: buffer_gl0_inv 2532; GFX11-NEXT: s_setpc_b64 s[30:31] 2533; 2534; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2535; GFX10: ; %bb.0: 2536; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2537; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 2538; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 2539; GFX10-NEXT: s_mov_b32 s4, 0 2540; GFX10-NEXT: flat_load_dword v0, v[3:4] 2541; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start 2542; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2543; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2544; GFX10-NEXT: v_mov_b32_e32 v1, v0 2545; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 2546; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2547; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2548; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2549; GFX10-NEXT: buffer_gl1_inv 2550; GFX10-NEXT: buffer_gl0_inv 2551; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 2552; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2553; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2554; GFX10-NEXT: s_cbranch_execnz .LBB13_1 2555; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2556; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2557; GFX10-NEXT: s_setpc_b64 s[30:31] 2558; 2559; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2560; GFX90A: ; %bb.0: 2561; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2562; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 2563; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, 
vcc 2564; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 2565; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 2566; GFX90A-NEXT: ; implicit-def: $vgpr0 2567; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 2568; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2569; GFX90A-NEXT: s_cbranch_execnz .LBB13_3 2570; GFX90A-NEXT: ; %bb.1: ; %Flow2 2571; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2572; GFX90A-NEXT: s_cbranch_execnz .LBB13_8 2573; GFX90A-NEXT: .LBB13_2: ; %atomicrmw.phi 2574; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2575; GFX90A-NEXT: s_waitcnt vmcnt(0) 2576; GFX90A-NEXT: s_setpc_b64 s[30:31] 2577; GFX90A-NEXT: .LBB13_3: ; %atomicrmw.check.private 2578; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 2579; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 2580; GFX90A-NEXT: ; implicit-def: $vgpr0 2581; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 2582; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 2583; GFX90A-NEXT: s_cbranch_execz .LBB13_5 2584; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 2585; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc 2586; GFX90A-NEXT: s_waitcnt vmcnt(0) 2587; GFX90A-NEXT: buffer_wbinvl1 2588; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 2589; GFX90A-NEXT: ; implicit-def: $vgpr2 2590; GFX90A-NEXT: .LBB13_5: ; %Flow 2591; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 2592; GFX90A-NEXT: s_cbranch_execz .LBB13_7 2593; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 2594; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 2595; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc 2596; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen 2597; GFX90A-NEXT: s_waitcnt vmcnt(0) 2598; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 2599; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 2600; GFX90A-NEXT: .LBB13_7: ; %Flow1 2601; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 2602; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 2603; GFX90A-NEXT: ; implicit-def: $vgpr2 2604; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2605; GFX90A-NEXT: s_cbranch_execz .LBB13_2 2606; 
GFX90A-NEXT: .LBB13_8: ; %atomicrmw.shared 2607; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 2608; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc 2609; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 2610; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2611; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2612; GFX90A-NEXT: s_waitcnt vmcnt(0) 2613; GFX90A-NEXT: s_setpc_b64 s[30:31] 2614; 2615; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2616; GFX908: ; %bb.0: 2617; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2618; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 2619; GFX908-NEXT: s_mov_b64 s[4:5], 0 2620; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start 2621; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2622; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2623; GFX908-NEXT: v_mov_b32_e32 v4, v3 2624; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 2625; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 2626; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2627; GFX908-NEXT: buffer_wbinvl1 2628; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2629; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2630; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2631; GFX908-NEXT: s_cbranch_execnz .LBB13_1 2632; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2633; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2634; GFX908-NEXT: v_mov_b32_e32 v0, v3 2635; GFX908-NEXT: s_setpc_b64 s[30:31] 2636; 2637; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2638; GFX8: ; %bb.0: 2639; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2640; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 2641; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2642; GFX8-NEXT: flat_load_dword v0, v[3:4] 2643; GFX8-NEXT: s_mov_b64 s[4:5], 0 2644; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start 2645; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2646; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2647; GFX8-NEXT: v_mov_b32_e32 v1, v0 2648; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 
2649; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2650; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2651; GFX8-NEXT: buffer_wbinvl1 2652; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2653; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2654; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2655; GFX8-NEXT: s_cbranch_execnz .LBB13_1 2656; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2657; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2658; GFX8-NEXT: s_setpc_b64 s[30:31] 2659; 2660; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2661; GFX7: ; %bb.0: 2662; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2663; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 2664; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2665; GFX7-NEXT: flat_load_dword v0, v[3:4] 2666; GFX7-NEXT: s_mov_b64 s[4:5], 0 2667; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start 2668; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2669; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2670; GFX7-NEXT: v_mov_b32_e32 v1, v0 2671; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 2672; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2673; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2674; GFX7-NEXT: buffer_wbinvl1 2675; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2676; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2677; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2678; GFX7-NEXT: s_cbranch_execnz .LBB13_1 2679; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2680; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2681; GFX7-NEXT: s_setpc_b64 s[30:31] 2682 %gep = getelementptr float, ptr %ptr, i64 511 2683 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2684 ret float %result 2685} 2686 2687define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2688; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2689; GFX12: ; %bb.0: 2690; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2691; GFX12-NEXT: 
s_wait_expcnt 0x0 2692; GFX12-NEXT: s_wait_samplecnt 0x0 2693; GFX12-NEXT: s_wait_bvhcnt 0x0 2694; GFX12-NEXT: s_wait_kmcnt 0x0 2695; GFX12-NEXT: s_wait_storecnt 0x0 2696; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2697; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2698; GFX12-NEXT: global_inv scope:SCOPE_DEV 2699; GFX12-NEXT: s_setpc_b64 s[30:31] 2700; 2701; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2702; GFX940: ; %bb.0: 2703; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2704; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2705; GFX940-NEXT: s_nop 1 2706; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2707; GFX940-NEXT: buffer_wbl2 sc1 2708; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 2709; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2710; GFX940-NEXT: buffer_inv sc1 2711; GFX940-NEXT: s_setpc_b64 s[30:31] 2712; 2713; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2714; GFX11: ; %bb.0: 2715; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2716; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 2717; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 2718; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2719; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc 2720; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2721; GFX11-NEXT: buffer_gl1_inv 2722; GFX11-NEXT: buffer_gl0_inv 2723; GFX11-NEXT: s_setpc_b64 s[30:31] 2724; 2725; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2726; GFX10: ; %bb.0: 2727; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2728; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 2729; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 2730; GFX10-NEXT: s_mov_b32 s4, 0 2731; GFX10-NEXT: flat_load_dword v0, v[3:4] 2732; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start 2733; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 2734; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2735; GFX10-NEXT: v_mov_b32_e32 v1, v0 2736; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 2737; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2738; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2739; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2740; GFX10-NEXT: buffer_gl1_inv 2741; GFX10-NEXT: buffer_gl0_inv 2742; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 2743; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2744; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2745; GFX10-NEXT: s_cbranch_execnz .LBB14_1 2746; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2747; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2748; GFX10-NEXT: s_setpc_b64 s[30:31] 2749; 2750; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2751; GFX90A: ; %bb.0: 2752; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2753; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 2754; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 2755; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 2756; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 2757; GFX90A-NEXT: ; implicit-def: $vgpr0 2758; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 2759; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2760; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 2761; GFX90A-NEXT: ; %bb.1: ; %Flow2 2762; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2763; GFX90A-NEXT: s_cbranch_execnz .LBB14_8 2764; GFX90A-NEXT: .LBB14_2: ; %atomicrmw.phi 2765; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2766; GFX90A-NEXT: s_waitcnt vmcnt(0) 2767; GFX90A-NEXT: s_setpc_b64 s[30:31] 2768; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.check.private 2769; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 2770; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 2771; GFX90A-NEXT: ; implicit-def: $vgpr0 2772; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 2773; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 2774; GFX90A-NEXT: s_cbranch_execz .LBB14_5 2775; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 2776; GFX90A-NEXT: 
global_atomic_add_f32 v0, v[4:5], v2, off glc 2777; GFX90A-NEXT: s_waitcnt vmcnt(0) 2778; GFX90A-NEXT: buffer_wbinvl1 2779; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 2780; GFX90A-NEXT: ; implicit-def: $vgpr2 2781; GFX90A-NEXT: .LBB14_5: ; %Flow 2782; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 2783; GFX90A-NEXT: s_cbranch_execz .LBB14_7 2784; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 2785; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 2786; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc 2787; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen 2788; GFX90A-NEXT: s_waitcnt vmcnt(0) 2789; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 2790; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 2791; GFX90A-NEXT: .LBB14_7: ; %Flow1 2792; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 2793; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 2794; GFX90A-NEXT: ; implicit-def: $vgpr2 2795; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2796; GFX90A-NEXT: s_cbranch_execz .LBB14_2 2797; GFX90A-NEXT: .LBB14_8: ; %atomicrmw.shared 2798; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 2799; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc 2800; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 2801; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2802; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2803; GFX90A-NEXT: s_waitcnt vmcnt(0) 2804; GFX90A-NEXT: s_setpc_b64 s[30:31] 2805; 2806; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2807; GFX908: ; %bb.0: 2808; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2809; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 2810; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 2811; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2812; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2813; GFX908-NEXT: flat_load_dword v0, v[0:1] 2814; GFX908-NEXT: s_mov_b64 s[4:5], 0 2815; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start 2816; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2817; GFX908-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 2818; GFX908-NEXT: v_mov_b32_e32 v1, v0 2819; GFX908-NEXT: v_add_f32_e32 v0, v1, v2 2820; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2821; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2822; GFX908-NEXT: buffer_wbinvl1 2823; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2824; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2825; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2826; GFX908-NEXT: s_cbranch_execnz .LBB14_1 2827; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2828; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2829; GFX908-NEXT: s_setpc_b64 s[30:31] 2830; 2831; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2832; GFX8: ; %bb.0: 2833; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2834; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 2835; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 2836; GFX8-NEXT: flat_load_dword v0, v[3:4] 2837; GFX8-NEXT: s_mov_b64 s[4:5], 0 2838; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start 2839; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2840; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2841; GFX8-NEXT: v_mov_b32_e32 v1, v0 2842; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 2843; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2844; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2845; GFX8-NEXT: buffer_wbinvl1 2846; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2847; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2848; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2849; GFX8-NEXT: s_cbranch_execnz .LBB14_1 2850; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2851; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2852; GFX8-NEXT: s_setpc_b64 s[30:31] 2853; 2854; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2855; GFX7: ; %bb.0: 2856; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2857; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 2858; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 2859; GFX7-NEXT: flat_load_dword v0, v[3:4] 2860; GFX7-NEXT: s_mov_b64 s[4:5], 0 2861; 
GFX7-NEXT: .LBB14_1: ; %atomicrmw.start 2862; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2863; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2864; GFX7-NEXT: v_mov_b32_e32 v1, v0 2865; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 2866; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2867; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2868; GFX7-NEXT: buffer_wbinvl1 2869; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2870; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2871; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2872; GFX7-NEXT: s_cbranch_execnz .LBB14_1 2873; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2874; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2875; GFX7-NEXT: s_setpc_b64 s[30:31] 2876 %gep = getelementptr float, ptr %ptr, i64 -512 2877 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2878 ret float %result 2879} 2880 2881define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2882; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2883; GFX12: ; %bb.0: 2884; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2885; GFX12-NEXT: s_wait_expcnt 0x0 2886; GFX12-NEXT: s_wait_samplecnt 0x0 2887; GFX12-NEXT: s_wait_bvhcnt 0x0 2888; GFX12-NEXT: s_wait_kmcnt 0x0 2889; GFX12-NEXT: s_wait_storecnt 0x0 2890; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV 2891; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 2892; GFX12-NEXT: global_inv scope:SCOPE_DEV 2893; GFX12-NEXT: s_setpc_b64 s[30:31] 2894; 2895; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2896; GFX940: ; %bb.0: 2897; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2898; GFX940-NEXT: buffer_wbl2 sc1 2899; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 2900; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2901; GFX940-NEXT: buffer_inv sc1 2902; GFX940-NEXT: s_setpc_b64 s[30:31] 2903; 2904; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2905; 
GFX11: ; %bb.0: 2906; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2907; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2908; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 2909; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2910; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2911; GFX11-NEXT: buffer_gl1_inv 2912; GFX11-NEXT: buffer_gl0_inv 2913; GFX11-NEXT: s_setpc_b64 s[30:31] 2914; 2915; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2916; GFX10: ; %bb.0: 2917; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2918; GFX10-NEXT: flat_load_dword v4, v[0:1] 2919; GFX10-NEXT: s_mov_b32 s4, 0 2920; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start 2921; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2922; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2923; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 2924; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2925; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2926; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2927; GFX10-NEXT: buffer_gl1_inv 2928; GFX10-NEXT: buffer_gl0_inv 2929; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2930; GFX10-NEXT: v_mov_b32_e32 v4, v3 2931; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2932; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2933; GFX10-NEXT: s_cbranch_execnz .LBB15_1 2934; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2935; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2936; GFX10-NEXT: s_setpc_b64 s[30:31] 2937; 2938; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2939; GFX90A: ; %bb.0: 2940; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2941; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 2942; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 2943; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 2944; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2945; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 2946; GFX90A-NEXT: ; %bb.1: ; %Flow2 2947; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2948; GFX90A-NEXT: s_cbranch_execnz .LBB15_8 2949; GFX90A-NEXT: .LBB15_2: ; %atomicrmw.phi 2950; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2951; GFX90A-NEXT: s_waitcnt vmcnt(0) 2952; GFX90A-NEXT: s_setpc_b64 s[30:31] 2953; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.check.private 2954; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 2955; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 2956; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 2957; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 2958; GFX90A-NEXT: s_cbranch_execz .LBB15_5 2959; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 2960; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 2961; GFX90A-NEXT: s_waitcnt vmcnt(0) 2962; GFX90A-NEXT: buffer_wbinvl1 2963; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2964; GFX90A-NEXT: ; implicit-def: $vgpr2 2965; GFX90A-NEXT: .LBB15_5: ; %Flow 2966; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 2967; GFX90A-NEXT: s_cbranch_execz .LBB15_7 2968; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 2969; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2970; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2971; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 2972; GFX90A-NEXT: s_waitcnt vmcnt(0) 2973; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 2974; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 2975; GFX90A-NEXT: .LBB15_7: ; %Flow1 2976; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 2977; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2978; GFX90A-NEXT: ; implicit-def: $vgpr2 2979; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2980; GFX90A-NEXT: s_cbranch_execz .LBB15_2 2981; GFX90A-NEXT: .LBB15_8: ; %atomicrmw.shared 2982; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 2983; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 2984; GFX90A-NEXT: ds_add_f32 v0, v2 2985; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2986; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2987; GFX90A-NEXT: s_waitcnt vmcnt(0) 2988; GFX90A-NEXT: s_setpc_b64 s[30:31] 2989; 2990; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2991; GFX908: ; %bb.0: 2992; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2993; 
GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 2994; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 2995; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 2996; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2997; GFX908-NEXT: s_cbranch_execnz .LBB15_3 2998; GFX908-NEXT: ; %bb.1: ; %Flow2 2999; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3000; GFX908-NEXT: s_cbranch_execnz .LBB15_8 3001; GFX908-NEXT: .LBB15_2: ; %atomicrmw.phi 3002; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3003; GFX908-NEXT: s_waitcnt vmcnt(0) 3004; GFX908-NEXT: s_setpc_b64 s[30:31] 3005; GFX908-NEXT: .LBB15_3: ; %atomicrmw.check.private 3006; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 3007; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 3008; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 3009; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 3010; GFX908-NEXT: s_cbranch_execz .LBB15_5 3011; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 3012; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 3013; GFX908-NEXT: s_waitcnt vmcnt(0) 3014; GFX908-NEXT: buffer_wbinvl1 3015; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3016; GFX908-NEXT: ; implicit-def: $vgpr2 3017; GFX908-NEXT: .LBB15_5: ; %Flow 3018; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 3019; GFX908-NEXT: s_cbranch_execz .LBB15_7 3020; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 3021; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3022; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3023; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 3024; GFX908-NEXT: s_waitcnt vmcnt(0) 3025; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 3026; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 3027; GFX908-NEXT: .LBB15_7: ; %Flow1 3028; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3029; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3030; GFX908-NEXT: ; implicit-def: $vgpr2 3031; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3032; GFX908-NEXT: s_cbranch_execz .LBB15_2 3033; GFX908-NEXT: .LBB15_8: ; %atomicrmw.shared 3034; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3035; 
GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3036; GFX908-NEXT: ds_add_f32 v0, v2 3037; GFX908-NEXT: s_waitcnt lgkmcnt(0) 3038; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3039; GFX908-NEXT: s_waitcnt vmcnt(0) 3040; GFX908-NEXT: s_setpc_b64 s[30:31] 3041; 3042; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 3043; GFX8: ; %bb.0: 3044; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3045; GFX8-NEXT: flat_load_dword v4, v[0:1] 3046; GFX8-NEXT: s_mov_b64 s[4:5], 0 3047; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start 3048; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3049; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3050; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 3051; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3052; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3053; GFX8-NEXT: buffer_wbinvl1 3054; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3055; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3056; GFX8-NEXT: v_mov_b32_e32 v4, v3 3057; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3058; GFX8-NEXT: s_cbranch_execnz .LBB15_1 3059; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3060; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3061; GFX8-NEXT: s_setpc_b64 s[30:31] 3062; 3063; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: 3064; GFX7: ; %bb.0: 3065; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3066; GFX7-NEXT: flat_load_dword v4, v[0:1] 3067; GFX7-NEXT: s_mov_b64 s[4:5], 0 3068; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start 3069; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3070; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3071; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 3072; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3073; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3074; GFX7-NEXT: buffer_wbinvl1 3075; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3076; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3077; GFX7-NEXT: v_mov_b32_e32 v4, v3 3078; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3079; GFX7-NEXT: s_cbranch_execnz .LBB15_1 3080; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end 3081; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3082; GFX7-NEXT: s_setpc_b64 s[30:31] 3083 %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3084 ret void 3085} 3086 3087define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 3088; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3089; GFX12: ; %bb.0: 3090; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3091; GFX12-NEXT: s_wait_expcnt 0x0 3092; GFX12-NEXT: s_wait_samplecnt 0x0 3093; GFX12-NEXT: s_wait_bvhcnt 0x0 3094; GFX12-NEXT: s_wait_kmcnt 0x0 3095; GFX12-NEXT: s_wait_storecnt 0x0 3096; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 3097; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 3098; GFX12-NEXT: global_inv scope:SCOPE_DEV 3099; GFX12-NEXT: s_setpc_b64 s[30:31] 3100; 3101; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3102; GFX940: ; %bb.0: 3103; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3104; GFX940-NEXT: buffer_wbl2 sc1 3105; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 3106; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3107; GFX940-NEXT: buffer_inv sc1 3108; GFX940-NEXT: s_setpc_b64 s[30:31] 3109; 3110; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3111; GFX11: ; %bb.0: 3112; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3113; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3114; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 3115; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3116; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3117; GFX11-NEXT: buffer_gl1_inv 3118; GFX11-NEXT: buffer_gl0_inv 3119; GFX11-NEXT: s_setpc_b64 s[30:31] 3120; 3121; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3122; GFX10: ; %bb.0: 3123; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 3124; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 3125; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 3126; GFX10-NEXT: s_mov_b32 s4, 0 3127; GFX10-NEXT: flat_load_dword v4, v[0:1] 3128; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start 3129; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3130; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3131; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 3132; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3133; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3134; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3135; GFX10-NEXT: buffer_gl1_inv 3136; GFX10-NEXT: buffer_gl0_inv 3137; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 3138; GFX10-NEXT: v_mov_b32_e32 v4, v3 3139; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 3140; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 3141; GFX10-NEXT: s_cbranch_execnz .LBB16_1 3142; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3143; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3144; GFX10-NEXT: s_setpc_b64 s[30:31] 3145; 3146; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3147; GFX90A: ; %bb.0: 3148; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3149; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 3150; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3151; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 3152; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3153; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3154; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3155; GFX90A-NEXT: s_cbranch_execnz .LBB16_3 3156; GFX90A-NEXT: ; %bb.1: ; %Flow2 3157; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3158; GFX90A-NEXT: s_cbranch_execnz .LBB16_8 3159; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi 3160; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3161; GFX90A-NEXT: s_waitcnt vmcnt(0) 3162; GFX90A-NEXT: s_setpc_b64 s[30:31] 3163; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.check.private 3164; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 3165; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, 
v1 3166; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 3167; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 3168; GFX90A-NEXT: s_cbranch_execz .LBB16_5 3169; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 3170; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 3171; GFX90A-NEXT: s_waitcnt vmcnt(0) 3172; GFX90A-NEXT: buffer_wbinvl1 3173; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3174; GFX90A-NEXT: ; implicit-def: $vgpr2 3175; GFX90A-NEXT: .LBB16_5: ; %Flow 3176; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 3177; GFX90A-NEXT: s_cbranch_execz .LBB16_7 3178; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 3179; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3180; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3181; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 3182; GFX90A-NEXT: s_waitcnt vmcnt(0) 3183; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 3184; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 3185; GFX90A-NEXT: .LBB16_7: ; %Flow1 3186; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 3187; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3188; GFX90A-NEXT: ; implicit-def: $vgpr2 3189; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3190; GFX90A-NEXT: s_cbranch_execz .LBB16_2 3191; GFX90A-NEXT: .LBB16_8: ; %atomicrmw.shared 3192; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3193; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3194; GFX90A-NEXT: ds_add_f32 v0, v2 3195; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3196; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3197; GFX90A-NEXT: s_waitcnt vmcnt(0) 3198; GFX90A-NEXT: s_setpc_b64 s[30:31] 3199; 3200; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3201; GFX908: ; %bb.0: 3202; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3203; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 3204; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3205; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 3206; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3207; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 3208; 
GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3209; GFX908-NEXT: s_cbranch_execnz .LBB16_3 3210; GFX908-NEXT: ; %bb.1: ; %Flow2 3211; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3212; GFX908-NEXT: s_cbranch_execnz .LBB16_8 3213; GFX908-NEXT: .LBB16_2: ; %atomicrmw.phi 3214; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3215; GFX908-NEXT: s_waitcnt vmcnt(0) 3216; GFX908-NEXT: s_setpc_b64 s[30:31] 3217; GFX908-NEXT: .LBB16_3: ; %atomicrmw.check.private 3218; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 3219; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 3220; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 3221; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 3222; GFX908-NEXT: s_cbranch_execz .LBB16_5 3223; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 3224; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 3225; GFX908-NEXT: s_waitcnt vmcnt(0) 3226; GFX908-NEXT: buffer_wbinvl1 3227; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3228; GFX908-NEXT: ; implicit-def: $vgpr2 3229; GFX908-NEXT: .LBB16_5: ; %Flow 3230; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 3231; GFX908-NEXT: s_cbranch_execz .LBB16_7 3232; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 3233; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3234; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3235; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 3236; GFX908-NEXT: s_waitcnt vmcnt(0) 3237; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 3238; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 3239; GFX908-NEXT: .LBB16_7: ; %Flow1 3240; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3241; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3242; GFX908-NEXT: ; implicit-def: $vgpr2 3243; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3244; GFX908-NEXT: s_cbranch_execz .LBB16_2 3245; GFX908-NEXT: .LBB16_8: ; %atomicrmw.shared 3246; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3247; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3248; GFX908-NEXT: ds_add_f32 v0, v2 3249; GFX908-NEXT: s_waitcnt lgkmcnt(0) 3250; GFX908-NEXT: s_or_b64 exec, 
exec, s[4:5] 3251; GFX908-NEXT: s_waitcnt vmcnt(0) 3252; GFX908-NEXT: s_setpc_b64 s[30:31] 3253; 3254; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3255; GFX8: ; %bb.0: 3256; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3257; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 3258; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3259; GFX8-NEXT: flat_load_dword v4, v[0:1] 3260; GFX8-NEXT: s_mov_b64 s[4:5], 0 3261; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start 3262; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3263; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3264; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 3265; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3266; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3267; GFX8-NEXT: buffer_wbinvl1 3268; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3269; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3270; GFX8-NEXT: v_mov_b32_e32 v4, v3 3271; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3272; GFX8-NEXT: s_cbranch_execnz .LBB16_1 3273; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3274; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3275; GFX8-NEXT: s_setpc_b64 s[30:31] 3276; 3277; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3278; GFX7: ; %bb.0: 3279; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3280; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 3281; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3282; GFX7-NEXT: flat_load_dword v4, v[0:1] 3283; GFX7-NEXT: s_mov_b64 s[4:5], 0 3284; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start 3285; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3286; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3287; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 3288; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3289; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3290; GFX7-NEXT: buffer_wbinvl1 3291; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3292; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3293; GFX7-NEXT: v_mov_b32_e32 v4, v3 3294; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[4:5] 3295; GFX7-NEXT: s_cbranch_execnz .LBB16_1 3296; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3297; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3298; GFX7-NEXT: s_setpc_b64 s[30:31] 3299 %gep = getelementptr float, ptr %ptr, i64 511 3300 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3301 ret void 3302} 3303 3304define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 3305; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3306; GFX12: ; %bb.0: 3307; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3308; GFX12-NEXT: s_wait_expcnt 0x0 3309; GFX12-NEXT: s_wait_samplecnt 0x0 3310; GFX12-NEXT: s_wait_bvhcnt 0x0 3311; GFX12-NEXT: s_wait_kmcnt 0x0 3312; GFX12-NEXT: s_wait_storecnt 0x0 3313; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV 3314; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 3315; GFX12-NEXT: global_inv scope:SCOPE_DEV 3316; GFX12-NEXT: s_setpc_b64 s[30:31] 3317; 3318; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3319; GFX940: ; %bb.0: 3320; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3321; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 3322; GFX940-NEXT: s_nop 1 3323; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 3324; GFX940-NEXT: buffer_wbl2 sc1 3325; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 3326; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3327; GFX940-NEXT: buffer_inv sc1 3328; GFX940-NEXT: s_setpc_b64 s[30:31] 3329; 3330; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3331; GFX11: ; %bb.0: 3332; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3333; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 3334; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 3335; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3336; GFX11-NEXT: flat_atomic_add_f32 
v[0:1], v2 3337; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3338; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3339; GFX11-NEXT: buffer_gl1_inv 3340; GFX11-NEXT: buffer_gl0_inv 3341; GFX11-NEXT: s_setpc_b64 s[30:31] 3342; 3343; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3344; GFX10: ; %bb.0: 3345; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3346; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 3347; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 3348; GFX10-NEXT: s_mov_b32 s4, 0 3349; GFX10-NEXT: flat_load_dword v4, v[0:1] 3350; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start 3351; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3352; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3353; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 3354; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3355; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3356; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3357; GFX10-NEXT: buffer_gl1_inv 3358; GFX10-NEXT: buffer_gl0_inv 3359; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 3360; GFX10-NEXT: v_mov_b32_e32 v4, v3 3361; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 3362; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 3363; GFX10-NEXT: s_cbranch_execnz .LBB17_1 3364; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3365; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3366; GFX10-NEXT: s_setpc_b64 s[30:31] 3367; 3368; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3369; GFX90A: ; %bb.0: 3370; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3371; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 3372; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 3373; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 3374; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3375; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3376; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3377; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 3378; GFX90A-NEXT: ; %bb.1: ; %Flow2 3379; GFX90A-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] 3380; GFX90A-NEXT: s_cbranch_execnz .LBB17_8 3381; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.phi 3382; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3383; GFX90A-NEXT: s_waitcnt vmcnt(0) 3384; GFX90A-NEXT: s_setpc_b64 s[30:31] 3385; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.check.private 3386; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 3387; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 3388; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 3389; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 3390; GFX90A-NEXT: s_cbranch_execz .LBB17_5 3391; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 3392; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 3393; GFX90A-NEXT: s_waitcnt vmcnt(0) 3394; GFX90A-NEXT: buffer_wbinvl1 3395; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3396; GFX90A-NEXT: ; implicit-def: $vgpr2 3397; GFX90A-NEXT: .LBB17_5: ; %Flow 3398; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 3399; GFX90A-NEXT: s_cbranch_execz .LBB17_7 3400; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 3401; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3402; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3403; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 3404; GFX90A-NEXT: s_waitcnt vmcnt(0) 3405; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 3406; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 3407; GFX90A-NEXT: .LBB17_7: ; %Flow1 3408; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 3409; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3410; GFX90A-NEXT: ; implicit-def: $vgpr2 3411; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3412; GFX90A-NEXT: s_cbranch_execz .LBB17_2 3413; GFX90A-NEXT: .LBB17_8: ; %atomicrmw.shared 3414; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3415; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3416; GFX90A-NEXT: ds_add_f32 v0, v2 3417; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3418; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3419; GFX90A-NEXT: s_waitcnt vmcnt(0) 3420; GFX90A-NEXT: s_setpc_b64 s[30:31] 3421; 3422; GFX908-LABEL: 
flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3423; GFX908: ; %bb.0: 3424; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3425; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 3426; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 3427; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 3428; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3429; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 3430; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3431; GFX908-NEXT: s_cbranch_execnz .LBB17_3 3432; GFX908-NEXT: ; %bb.1: ; %Flow2 3433; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3434; GFX908-NEXT: s_cbranch_execnz .LBB17_8 3435; GFX908-NEXT: .LBB17_2: ; %atomicrmw.phi 3436; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3437; GFX908-NEXT: s_waitcnt vmcnt(0) 3438; GFX908-NEXT: s_setpc_b64 s[30:31] 3439; GFX908-NEXT: .LBB17_3: ; %atomicrmw.check.private 3440; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 3441; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 3442; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 3443; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 3444; GFX908-NEXT: s_cbranch_execz .LBB17_5 3445; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 3446; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 3447; GFX908-NEXT: s_waitcnt vmcnt(0) 3448; GFX908-NEXT: buffer_wbinvl1 3449; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3450; GFX908-NEXT: ; implicit-def: $vgpr2 3451; GFX908-NEXT: .LBB17_5: ; %Flow 3452; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 3453; GFX908-NEXT: s_cbranch_execz .LBB17_7 3454; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 3455; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3456; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3457; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 3458; GFX908-NEXT: s_waitcnt vmcnt(0) 3459; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 3460; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 3461; GFX908-NEXT: .LBB17_7: ; %Flow1 3462; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3463; 
GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3464; GFX908-NEXT: ; implicit-def: $vgpr2 3465; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3466; GFX908-NEXT: s_cbranch_execz .LBB17_2 3467; GFX908-NEXT: .LBB17_8: ; %atomicrmw.shared 3468; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3469; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 3470; GFX908-NEXT: ds_add_f32 v0, v2 3471; GFX908-NEXT: s_waitcnt lgkmcnt(0) 3472; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3473; GFX908-NEXT: s_waitcnt vmcnt(0) 3474; GFX908-NEXT: s_setpc_b64 s[30:31] 3475; 3476; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3477; GFX8: ; %bb.0: 3478; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3479; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 3480; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 3481; GFX8-NEXT: flat_load_dword v4, v[0:1] 3482; GFX8-NEXT: s_mov_b64 s[4:5], 0 3483; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start 3484; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3485; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3486; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 3487; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3488; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3489; GFX8-NEXT: buffer_wbinvl1 3490; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3491; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3492; GFX8-NEXT: v_mov_b32_e32 v4, v3 3493; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3494; GFX8-NEXT: s_cbranch_execnz .LBB17_1 3495; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3496; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3497; GFX8-NEXT: s_setpc_b64 s[30:31] 3498; 3499; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 3500; GFX7: ; %bb.0: 3501; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3502; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 3503; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 3504; GFX7-NEXT: flat_load_dword v4, v[0:1] 3505; GFX7-NEXT: s_mov_b64 s[4:5], 0 3506; GFX7-NEXT: .LBB17_1: ; 
%atomicrmw.start 3507; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3508; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3509; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 3510; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3511; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3512; GFX7-NEXT: buffer_wbinvl1 3513; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3514; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3515; GFX7-NEXT: v_mov_b32_e32 v4, v3 3516; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3517; GFX7-NEXT: s_cbranch_execnz .LBB17_1 3518; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3519; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3520; GFX7-NEXT: s_setpc_b64 s[30:31] 3521 %gep = getelementptr float, ptr %ptr, i64 -512 3522 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3523 ret void 3524} 3525 3526define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 3527; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3528; GFX12: ; %bb.0: 3529; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3530; GFX12-NEXT: s_wait_expcnt 0x0 3531; GFX12-NEXT: s_wait_samplecnt 0x0 3532; GFX12-NEXT: s_wait_bvhcnt 0x0 3533; GFX12-NEXT: s_wait_kmcnt 0x0 3534; GFX12-NEXT: global_wb scope:SCOPE_SYS 3535; GFX12-NEXT: s_wait_storecnt 0x0 3536; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 3537; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3538; GFX12-NEXT: global_inv scope:SCOPE_SYS 3539; GFX12-NEXT: s_setpc_b64 s[30:31] 3540; 3541; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3542; GFX940: ; %bb.0: 3543; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3544; GFX940-NEXT: buffer_wbl2 sc0 sc1 3545; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 3546; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3547; GFX940-NEXT: buffer_inv sc0 sc1 3548; GFX940-NEXT: 
s_setpc_b64 s[30:31] 3549; 3550; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3551; GFX11: ; %bb.0: 3552; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3553; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3554; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc 3555; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3556; GFX11-NEXT: buffer_gl1_inv 3557; GFX11-NEXT: buffer_gl0_inv 3558; GFX11-NEXT: s_setpc_b64 s[30:31] 3559; 3560; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3561; GFX10: ; %bb.0: 3562; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3563; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 3564; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 3565; GFX10-NEXT: s_mov_b32 s4, 0 3566; GFX10-NEXT: flat_load_dword v0, v[3:4] 3567; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start 3568; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3569; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3570; GFX10-NEXT: v_mov_b32_e32 v1, v0 3571; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 3572; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3573; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 3574; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3575; GFX10-NEXT: buffer_gl1_inv 3576; GFX10-NEXT: buffer_gl0_inv 3577; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 3578; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 3579; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 3580; GFX10-NEXT: s_cbranch_execnz .LBB18_1 3581; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3582; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3583; GFX10-NEXT: s_setpc_b64 s[30:31] 3584; 3585; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3586; GFX90A: ; %bb.0: 3587; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3588; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 3589; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 3590; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 3591; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3592; GFX90A-NEXT: ; implicit-def: $vgpr0 3593; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3594; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3595; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 3596; GFX90A-NEXT: ; %bb.1: ; %Flow2 3597; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3598; GFX90A-NEXT: s_cbranch_execnz .LBB18_8 3599; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi 3600; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3601; GFX90A-NEXT: s_waitcnt vmcnt(0) 3602; GFX90A-NEXT: s_setpc_b64 s[30:31] 3603; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.check.private 3604; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 3605; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 3606; GFX90A-NEXT: ; implicit-def: $vgpr0 3607; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 3608; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 3609; GFX90A-NEXT: s_cbranch_execz .LBB18_5 3610; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 3611; GFX90A-NEXT: buffer_wbl2 3612; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc 3613; GFX90A-NEXT: s_waitcnt vmcnt(0) 3614; GFX90A-NEXT: buffer_invl2 3615; GFX90A-NEXT: buffer_wbinvl1 3616; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 3617; GFX90A-NEXT: ; implicit-def: $vgpr2 3618; GFX90A-NEXT: .LBB18_5: ; %Flow 3619; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 3620; GFX90A-NEXT: s_cbranch_execz .LBB18_7 3621; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 3622; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3623; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc 3624; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen 3625; GFX90A-NEXT: s_waitcnt vmcnt(0) 3626; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 3627; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 3628; GFX90A-NEXT: .LBB18_7: ; %Flow1 3629; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 3630; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 3631; GFX90A-NEXT: ; implicit-def: $vgpr2 3632; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3633; GFX90A-NEXT: s_cbranch_execz .LBB18_2 3634; 
GFX90A-NEXT: .LBB18_8: ; %atomicrmw.shared 3635; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3636; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc 3637; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 3638; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3639; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3640; GFX90A-NEXT: s_waitcnt vmcnt(0) 3641; GFX90A-NEXT: s_setpc_b64 s[30:31] 3642; 3643; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3644; GFX908: ; %bb.0: 3645; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3646; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 3647; GFX908-NEXT: s_mov_b64 s[4:5], 0 3648; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start 3649; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3650; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3651; GFX908-NEXT: v_mov_b32_e32 v4, v3 3652; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 3653; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 3654; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3655; GFX908-NEXT: buffer_wbinvl1 3656; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3657; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3658; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3659; GFX908-NEXT: s_cbranch_execnz .LBB18_1 3660; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3661; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3662; GFX908-NEXT: v_mov_b32_e32 v0, v3 3663; GFX908-NEXT: s_setpc_b64 s[30:31] 3664; 3665; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 3666; GFX8: ; %bb.0: 3667; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3668; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 3669; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3670; GFX8-NEXT: flat_load_dword v0, v[3:4] 3671; GFX8-NEXT: s_mov_b64 s[4:5], 0 3672; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start 3673; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3674; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3675; GFX8-NEXT: v_mov_b32_e32 v1, v0 3676; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[3:4]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, v0
; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

; Non-returning f32 fadd through a flat pointer at element index 511
; (byte offset 2044 = 0x7fc). The atomicrmw is seq_cst with no syncscope and
; carries only !amdgpu.no.fine.grained.memory; its result is unused.
; Attribute group #1 is defined outside this part of the file - presumably the
; "ftz" denormal setting named in the function; TODO confirm there.
; Per the assertions below: gfx1200, gfx940 and gfx1100 select a single
; flat_atomic_add_f32 with offset:2044; gfx1010, tonga and hawaii add 0x7fc to
; the pointer and run a flat_atomic_cmpswap loop; gfx90a and gfx908 branch on
; comparisons against src_shared_base and src_private_base and then use
; global_atomic_add_f32, a buffer load/add/store sequence, or ds_add_f32.
define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
; GFX90A-NEXT: ; %bb.1: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_8
; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB19_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB19_5: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB19_7
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB19_7: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_8: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: ds_add_f32 v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_3
; GFX908-NEXT: ; %bb.1: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_8
; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB19_3: ; %atomicrmw.check.private
; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX908-NEXT: s_cbranch_execz .LBB19_5
; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: ; implicit-def: $vgpr2
; GFX908-NEXT: .LBB19_5: ; %Flow
; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX908-NEXT: s_cbranch_execz .LBB19_7
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT: .LBB19_7: ; %Flow1
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: ; implicit-def: $vgpr2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB19_2
; GFX908-NEXT: .LBB19_8: ; %atomicrmw.shared
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT: ds_add_f32 v0, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; Returning f32 fadd through a flat pointer at element index 511 (byte offset
; 2044 = 0x7fc), seq_cst, with both !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode attached; the old value is returned.
; Per the assertions below: gfx1200, gfx940 and gfx1100 select a single
; returning flat_atomic_add_f32 with offset:2044; gfx1010, gfx908, tonga and
; hawaii run a flat_atomic_cmpswap loop; gfx90a branches on comparisons against
; src_shared_base and src_private_base and then uses global_atomic_add_f32, a
; buffer load/add/store sequence, or ds_add_rtn_f32.
; Review note - the function is named "flat_agent_..." but the atomicrmw below
; has no syncscope, and the gfx1200 assertions show scope:SCOPE_SYS, whereas the
; flat_agent_..._no_remote_memory test further down expects scope:SCOPE_DEV.
; TODO confirm whether syncscope("agent") was intended; if so, add it and
; regenerate the assertions with utils/update_llc_test_checks.py.
define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_3
; GFX90A-NEXT: ; %bb.1: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_8
; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5
; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB20_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB20_5: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB20_7
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2
; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX90A-NEXT: .LBB20_7: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB20_2
; GFX90A-NEXT: .LBB20_8: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[3:4]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, v0
; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret float %result
}

; Non-returning f32 fadd through a flat pointer at element index 511 (byte
; offset 2044 = 0x7fc), seq_cst, with both !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode attached; the result is unused.
; Per the assertions below: gfx1200, gfx940 and gfx1100 select a single
; flat_atomic_add_f32 with offset:2044; gfx1010, tonga and hawaii add 0x7fc to
; the pointer and run a flat_atomic_cmpswap loop; gfx90a and gfx908 branch on
; comparisons against src_shared_base and src_private_base and then use
; global_atomic_add_f32, a buffer load/add/store sequence, or ds_add_f32.
; Review note - the function is named "flat_agent_..." but the atomicrmw below
; has no syncscope, and the gfx1200 assertions show scope:SCOPE_SYS, whereas the
; flat_agent_..._no_remote_memory test that follows expects scope:SCOPE_DEV.
; TODO confirm whether syncscope("agent") was intended; if so, add it and
; regenerate the assertions with utils/update_llc_test_checks.py.
define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
; GFX90A-NEXT: ; %bb.1: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_8
; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB21_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB21_5: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB21_7
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB21_7: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB21_2
; GFX90A-NEXT: .LBB21_8: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: ds_add_f32 v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_3
; GFX908-NEXT: ; %bb.1: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_8
; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB21_3: ; %atomicrmw.check.private
; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX908-NEXT: s_cbranch_execz .LBB21_5
; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: ; implicit-def: $vgpr2
; GFX908-NEXT: .LBB21_5: ; %Flow
; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX908-NEXT: s_cbranch_execz .LBB21_7
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT: .LBB21_7: ; %Flow1
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: ; implicit-def: $vgpr2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB21_2
; GFX908-NEXT: .LBB21_8: ; %atomicrmw.shared
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT: ds_add_f32 v0, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret void
}

define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
;
GFX12-NEXT: global_inv scope:SCOPE_DEV 4356; GFX12-NEXT: s_setpc_b64 s[30:31] 4357; 4358; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4359; GFX940: ; %bb.0: 4360; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4361; GFX940-NEXT: buffer_wbl2 sc1 4362; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 4363; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4364; GFX940-NEXT: buffer_inv sc1 4365; GFX940-NEXT: s_setpc_b64 s[30:31] 4366; 4367; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4368; GFX11: ; %bb.0: 4369; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4370; GFX11-NEXT: flat_load_b32 v3, v[0:1] 4371; GFX11-NEXT: s_mov_b32 s0, 0 4372; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start 4373; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4374; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4375; GFX11-NEXT: v_mov_b32_e32 v4, v3 4376; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4377; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 4378; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4379; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 4380; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4381; GFX11-NEXT: buffer_gl1_inv 4382; GFX11-NEXT: buffer_gl0_inv 4383; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4384; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 4385; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4386; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 4387; GFX11-NEXT: s_cbranch_execnz .LBB22_1 4388; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 4389; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4390; GFX11-NEXT: v_mov_b32_e32 v0, v3 4391; GFX11-NEXT: s_setpc_b64 s[30:31] 4392; 4393; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4394; GFX10: ; %bb.0: 4395; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4396; GFX10-NEXT: flat_load_dword v3, v[0:1] 4397; GFX10-NEXT: s_mov_b32 s4, 0 4398; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start 4399; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4400; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4401; GFX10-NEXT: v_mov_b32_e32 v4, v3 4402; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 4403; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4404; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4405; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4406; GFX10-NEXT: buffer_gl1_inv 4407; GFX10-NEXT: buffer_gl0_inv 4408; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4409; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 4410; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 4411; GFX10-NEXT: s_cbranch_execnz .LBB22_1 4412; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 4413; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4414; GFX10-NEXT: v_mov_b32_e32 v0, v3 4415; GFX10-NEXT: s_setpc_b64 s[30:31] 4416; 4417; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4418; GFX90A: ; %bb.0: 4419; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4420; GFX90A-NEXT: flat_load_dword v3, v[0:1] 4421; GFX90A-NEXT: s_mov_b64 s[4:5], 0 4422; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start 4423; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4424; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4425; GFX90A-NEXT: v_mov_b32_e32 v5, v3 4426; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 4427; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 4428; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4429; GFX90A-NEXT: buffer_wbinvl1 4430; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 4431; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4432; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 4433; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 4434; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 4435; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4436; GFX90A-NEXT: v_mov_b32_e32 v0, v3 4437; GFX90A-NEXT: s_setpc_b64 s[30:31] 4438; 4439; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4440; GFX908: ; %bb.0: 4441; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4442; GFX908-NEXT: 
flat_load_dword v3, v[0:1] 4443; GFX908-NEXT: s_mov_b64 s[4:5], 0 4444; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start 4445; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4446; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4447; GFX908-NEXT: v_mov_b32_e32 v4, v3 4448; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 4449; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4450; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4451; GFX908-NEXT: buffer_wbinvl1 4452; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4453; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4454; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 4455; GFX908-NEXT: s_cbranch_execnz .LBB22_1 4456; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 4457; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4458; GFX908-NEXT: v_mov_b32_e32 v0, v3 4459; GFX908-NEXT: s_setpc_b64 s[30:31] 4460; 4461; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4462; GFX8: ; %bb.0: 4463; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4464; GFX8-NEXT: flat_load_dword v3, v[0:1] 4465; GFX8-NEXT: s_mov_b64 s[4:5], 0 4466; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start 4467; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4468; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4469; GFX8-NEXT: v_mov_b32_e32 v4, v3 4470; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 4471; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4472; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4473; GFX8-NEXT: buffer_wbinvl1 4474; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4475; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4476; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 4477; GFX8-NEXT: s_cbranch_execnz .LBB22_1 4478; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 4479; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4480; GFX8-NEXT: v_mov_b32_e32 v0, v3 4481; GFX8-NEXT: s_setpc_b64 s[30:31] 4482; 4483; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4484; GFX7: ; %bb.0: 4485; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4486; GFX7-NEXT: 
flat_load_dword v3, v[0:1] 4487; GFX7-NEXT: s_mov_b64 s[4:5], 0 4488; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start 4489; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4490; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4491; GFX7-NEXT: v_mov_b32_e32 v4, v3 4492; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 4493; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4494; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4495; GFX7-NEXT: buffer_wbinvl1 4496; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4497; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4498; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 4499; GFX7-NEXT: s_cbranch_execnz .LBB22_1 4500; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 4501; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4502; GFX7-NEXT: v_mov_b32_e32 v0, v3 4503; GFX7-NEXT: s_setpc_b64 s[30:31] 4504 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 4505 ret float %result 4506} 4507 4508define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 4509; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4510; GFX12: ; %bb.0: 4511; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4512; GFX12-NEXT: s_wait_expcnt 0x0 4513; GFX12-NEXT: s_wait_samplecnt 0x0 4514; GFX12-NEXT: s_wait_bvhcnt 0x0 4515; GFX12-NEXT: s_wait_kmcnt 0x0 4516; GFX12-NEXT: s_wait_storecnt 0x0 4517; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV 4518; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 4519; GFX12-NEXT: global_inv scope:SCOPE_DEV 4520; GFX12-NEXT: s_setpc_b64 s[30:31] 4521; 4522; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4523; GFX940: ; %bb.0: 4524; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4525; GFX940-NEXT: buffer_wbl2 sc1 4526; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 4527; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4528; GFX940-NEXT: buffer_inv sc1 4529; 
GFX940-NEXT: s_setpc_b64 s[30:31] 4530; 4531; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4532; GFX11: ; %bb.0: 4533; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4534; GFX11-NEXT: flat_load_b32 v4, v[0:1] 4535; GFX11-NEXT: s_mov_b32 s0, 0 4536; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start 4537; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4538; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4539; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 4540; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4541; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 4542; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4543; GFX11-NEXT: buffer_gl1_inv 4544; GFX11-NEXT: buffer_gl0_inv 4545; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4546; GFX11-NEXT: v_mov_b32_e32 v4, v3 4547; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 4548; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4549; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 4550; GFX11-NEXT: s_cbranch_execnz .LBB23_1 4551; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 4552; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4553; GFX11-NEXT: s_setpc_b64 s[30:31] 4554; 4555; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4556; GFX10: ; %bb.0: 4557; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4558; GFX10-NEXT: flat_load_dword v4, v[0:1] 4559; GFX10-NEXT: s_mov_b32 s4, 0 4560; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start 4561; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4562; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4563; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 4564; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4565; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4566; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4567; GFX10-NEXT: buffer_gl1_inv 4568; GFX10-NEXT: buffer_gl0_inv 4569; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4570; GFX10-NEXT: v_mov_b32_e32 v4, v3 4571; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 4572; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 
4573; GFX10-NEXT: s_cbranch_execnz .LBB23_1 4574; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 4575; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4576; GFX10-NEXT: s_setpc_b64 s[30:31] 4577; 4578; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4579; GFX90A: ; %bb.0: 4580; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4581; GFX90A-NEXT: flat_load_dword v5, v[0:1] 4582; GFX90A-NEXT: s_mov_b64 s[4:5], 0 4583; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start 4584; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4585; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4586; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 4587; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 4588; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4589; GFX90A-NEXT: buffer_wbinvl1 4590; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 4591; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4592; GFX90A-NEXT: v_mov_b32_e32 v5, v3 4593; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 4594; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 4595; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 4596; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4597; GFX90A-NEXT: s_setpc_b64 s[30:31] 4598; 4599; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4600; GFX908: ; %bb.0: 4601; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4602; GFX908-NEXT: flat_load_dword v4, v[0:1] 4603; GFX908-NEXT: s_mov_b64 s[4:5], 0 4604; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start 4605; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4606; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4607; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 4608; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4609; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4610; GFX908-NEXT: buffer_wbinvl1 4611; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4612; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4613; GFX908-NEXT: v_mov_b32_e32 v4, v3 4614; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 4615; GFX908-NEXT: 
s_cbranch_execnz .LBB23_1 4616; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 4617; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4618; GFX908-NEXT: s_setpc_b64 s[30:31] 4619; 4620; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4621; GFX8: ; %bb.0: 4622; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4623; GFX8-NEXT: flat_load_dword v4, v[0:1] 4624; GFX8-NEXT: s_mov_b64 s[4:5], 0 4625; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start 4626; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4627; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4628; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 4629; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4630; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4631; GFX8-NEXT: buffer_wbinvl1 4632; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4633; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4634; GFX8-NEXT: v_mov_b32_e32 v4, v3 4635; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 4636; GFX8-NEXT: s_cbranch_execnz .LBB23_1 4637; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 4638; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4639; GFX8-NEXT: s_setpc_b64 s[30:31] 4640; 4641; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4642; GFX7: ; %bb.0: 4643; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4644; GFX7-NEXT: flat_load_dword v4, v[0:1] 4645; GFX7-NEXT: s_mov_b64 s[4:5], 0 4646; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start 4647; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4648; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4649; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 4650; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4651; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4652; GFX7-NEXT: buffer_wbinvl1 4653; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4654; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4655; GFX7-NEXT: v_mov_b32_e32 v4, v3 4656; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 4657; GFX7-NEXT: s_cbranch_execnz .LBB23_1 4658; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 4659; GFX7-NEXT: s_or_b64 exec, 
exec, s[4:5] 4660; GFX7-NEXT: s_setpc_b64 s[30:31] 4661 %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 4662 ret void 4663} 4664 4665define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 { 4666; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4667; GFX12: ; %bb.0: 4668; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4669; GFX12-NEXT: s_wait_expcnt 0x0 4670; GFX12-NEXT: s_wait_samplecnt 0x0 4671; GFX12-NEXT: s_wait_bvhcnt 0x0 4672; GFX12-NEXT: s_wait_kmcnt 0x0 4673; GFX12-NEXT: s_wait_storecnt 0x0 4674; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4675; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4676; GFX12-NEXT: global_inv scope:SCOPE_DEV 4677; GFX12-NEXT: s_setpc_b64 s[30:31] 4678; 4679; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4680; GFX940: ; %bb.0: 4681; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4682; GFX940-NEXT: buffer_wbl2 sc1 4683; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 4684; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4685; GFX940-NEXT: buffer_inv sc1 4686; GFX940-NEXT: s_setpc_b64 s[30:31] 4687; 4688; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4689; GFX11: ; %bb.0: 4690; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4691; GFX11-NEXT: flat_load_b32 v3, v[0:1] 4692; GFX11-NEXT: s_mov_b32 s0, 0 4693; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start 4694; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4695; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4696; GFX11-NEXT: v_mov_b32_e32 v4, v3 4697; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4698; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 4699; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4700; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 4701; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4702; GFX11-NEXT: buffer_gl1_inv 4703; GFX11-NEXT: buffer_gl0_inv 4704; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 4705; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 4706; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4707; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 4708; GFX11-NEXT: s_cbranch_execnz .LBB24_1 4709; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 4710; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4711; GFX11-NEXT: v_mov_b32_e32 v0, v3 4712; GFX11-NEXT: s_setpc_b64 s[30:31] 4713; 4714; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4715; GFX10: ; %bb.0: 4716; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4717; GFX10-NEXT: flat_load_dword v3, v[0:1] 4718; GFX10-NEXT: s_mov_b32 s4, 0 4719; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start 4720; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4721; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4722; GFX10-NEXT: v_mov_b32_e32 v4, v3 4723; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 4724; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4725; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4726; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4727; GFX10-NEXT: buffer_gl1_inv 4728; GFX10-NEXT: buffer_gl0_inv 4729; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4730; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 4731; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 4732; GFX10-NEXT: s_cbranch_execnz .LBB24_1 4733; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 4734; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4735; GFX10-NEXT: v_mov_b32_e32 v0, v3 4736; GFX10-NEXT: s_setpc_b64 s[30:31] 4737; 4738; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4739; GFX90A: ; %bb.0: 4740; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4741; GFX90A-NEXT: flat_load_dword v3, v[0:1] 4742; GFX90A-NEXT: s_mov_b64 s[4:5], 0 4743; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start 4744; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4745; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4746; GFX90A-NEXT: v_mov_b32_e32 v5, v3 4747; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 4748; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 4749; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4750; GFX90A-NEXT: buffer_wbinvl1 4751; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 4752; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4753; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 4754; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 4755; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 4756; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4757; GFX90A-NEXT: v_mov_b32_e32 v0, v3 4758; GFX90A-NEXT: s_setpc_b64 s[30:31] 4759; 4760; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4761; GFX908: ; %bb.0: 4762; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4763; GFX908-NEXT: flat_load_dword v3, v[0:1] 4764; GFX908-NEXT: s_mov_b64 s[4:5], 0 4765; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start 4766; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4767; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4768; GFX908-NEXT: v_mov_b32_e32 v4, v3 4769; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 4770; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4771; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4772; GFX908-NEXT: buffer_wbinvl1 4773; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4774; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4775; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 4776; GFX908-NEXT: s_cbranch_execnz .LBB24_1 4777; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 4778; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4779; GFX908-NEXT: v_mov_b32_e32 v0, v3 4780; GFX908-NEXT: s_setpc_b64 s[30:31] 4781; 4782; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4783; GFX8: ; %bb.0: 4784; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4785; GFX8-NEXT: flat_load_dword v3, v[0:1] 4786; GFX8-NEXT: s_mov_b64 s[4:5], 0 4787; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start 4788; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4789; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4790; GFX8-NEXT: v_mov_b32_e32 v4, v3 4791; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 4792; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4793; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 4794; GFX8-NEXT: buffer_wbinvl1 4795; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4796; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4797; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 4798; GFX8-NEXT: s_cbranch_execnz .LBB24_1 4799; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 4800; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4801; GFX8-NEXT: v_mov_b32_e32 v0, v3 4802; GFX8-NEXT: s_setpc_b64 s[30:31] 4803; 4804; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: 4805; GFX7: ; %bb.0: 4806; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4807; GFX7-NEXT: flat_load_dword v3, v[0:1] 4808; GFX7-NEXT: s_mov_b64 s[4:5], 0 4809; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start 4810; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4811; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4812; GFX7-NEXT: v_mov_b32_e32 v4, v3 4813; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 4814; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4815; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4816; GFX7-NEXT: buffer_wbinvl1 4817; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4818; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4819; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 4820; GFX7-NEXT: s_cbranch_execnz .LBB24_1 4821; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 4822; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4823; GFX7-NEXT: v_mov_b32_e32 v0, v3 4824; GFX7-NEXT: s_setpc_b64 s[30:31] 4825 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 4826 ret float %result 4827} 4828 4829define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 { 4830; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4831; GFX12: ; %bb.0: 4832; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4833; GFX12-NEXT: s_wait_expcnt 0x0 4834; GFX12-NEXT: s_wait_samplecnt 0x0 4835; GFX12-NEXT: s_wait_bvhcnt 0x0 4836; GFX12-NEXT: s_wait_kmcnt 0x0 4837; GFX12-NEXT: s_wait_storecnt 0x0 4838; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV 4839; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 4840; GFX12-NEXT: global_inv scope:SCOPE_DEV 4841; GFX12-NEXT: s_setpc_b64 s[30:31] 4842; 4843; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4844; GFX940: ; %bb.0: 4845; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4846; GFX940-NEXT: buffer_wbl2 sc1 4847; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 4848; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4849; GFX940-NEXT: buffer_inv sc1 4850; GFX940-NEXT: s_setpc_b64 s[30:31] 4851; 4852; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4853; GFX11: ; %bb.0: 4854; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4855; GFX11-NEXT: flat_load_b32 v4, v[0:1] 4856; GFX11-NEXT: s_mov_b32 s0, 0 4857; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start 4858; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4859; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4860; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 4861; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4862; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 4863; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4864; GFX11-NEXT: buffer_gl1_inv 4865; GFX11-NEXT: buffer_gl0_inv 4866; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4867; GFX11-NEXT: v_mov_b32_e32 v4, v3 4868; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 4869; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4870; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 4871; GFX11-NEXT: s_cbranch_execnz .LBB25_1 4872; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 4873; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4874; GFX11-NEXT: s_setpc_b64 s[30:31] 4875; 4876; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4877; GFX10: ; %bb.0: 4878; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4879; GFX10-NEXT: flat_load_dword v4, v[0:1] 4880; GFX10-NEXT: s_mov_b32 s4, 0 4881; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start 4882; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4883; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4884; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 4885; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4886; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4887; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4888; GFX10-NEXT: buffer_gl1_inv 4889; GFX10-NEXT: buffer_gl0_inv 4890; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 4891; GFX10-NEXT: v_mov_b32_e32 v4, v3 4892; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 4893; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 4894; GFX10-NEXT: s_cbranch_execnz .LBB25_1 4895; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 4896; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4897; GFX10-NEXT: s_setpc_b64 s[30:31] 4898; 4899; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4900; GFX90A: ; %bb.0: 4901; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4902; GFX90A-NEXT: flat_load_dword v5, v[0:1] 4903; GFX90A-NEXT: s_mov_b64 s[4:5], 0 4904; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start 4905; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4906; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4907; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 4908; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 4909; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4910; GFX90A-NEXT: buffer_wbinvl1 4911; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 4912; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4913; GFX90A-NEXT: v_mov_b32_e32 v5, v3 4914; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 4915; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 4916; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 4917; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4918; GFX90A-NEXT: s_setpc_b64 s[30:31] 4919; 4920; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4921; GFX908: ; %bb.0: 4922; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4923; GFX908-NEXT: flat_load_dword v4, v[0:1] 4924; GFX908-NEXT: s_mov_b64 s[4:5], 0 4925; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start 4926; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4927; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4928; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 4929; GFX908-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4930; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4931; GFX908-NEXT: buffer_wbinvl1 4932; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4933; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4934; GFX908-NEXT: v_mov_b32_e32 v4, v3 4935; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 4936; GFX908-NEXT: s_cbranch_execnz .LBB25_1 4937; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 4938; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4939; GFX908-NEXT: s_setpc_b64 s[30:31] 4940; 4941; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4942; GFX8: ; %bb.0: 4943; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4944; GFX8-NEXT: flat_load_dword v4, v[0:1] 4945; GFX8-NEXT: s_mov_b64 s[4:5], 0 4946; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start 4947; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4948; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4949; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 4950; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4951; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4952; GFX8-NEXT: buffer_wbinvl1 4953; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 4954; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4955; GFX8-NEXT: v_mov_b32_e32 v4, v3 4956; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 4957; GFX8-NEXT: s_cbranch_execnz .LBB25_1 4958; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 4959; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4960; GFX8-NEXT: s_setpc_b64 s[30:31] 4961; 4962; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: 4963; GFX7: ; %bb.0: 4964; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4965; GFX7-NEXT: flat_load_dword v4, v[0:1] 4966; GFX7-NEXT: s_mov_b64 s[4:5], 0 4967; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start 4968; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4969; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4970; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 4971; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 4972; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4973; GFX7-NEXT: buffer_wbinvl1 4974; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 4975; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4976; GFX7-NEXT: v_mov_b32_e32 v4, v3 4977; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 4978; GFX7-NEXT: s_cbranch_execnz .LBB25_1 4979; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 4980; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4981; GFX7-NEXT: s_setpc_b64 s[30:31] 4982 %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 4983 ret void 4984} 4985 4986define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 4987; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 4988; GFX12: ; %bb.0: 4989; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4990; GFX12-NEXT: s_wait_expcnt 0x0 4991; GFX12-NEXT: s_wait_samplecnt 0x0 4992; GFX12-NEXT: s_wait_bvhcnt 0x0 4993; GFX12-NEXT: s_wait_kmcnt 0x0 4994; GFX12-NEXT: s_wait_storecnt 0x0 4995; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4996; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4997; GFX12-NEXT: global_inv scope:SCOPE_DEV 4998; GFX12-NEXT: s_setpc_b64 s[30:31] 4999; 5000; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5001; GFX940: ; %bb.0: 5002; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5003; GFX940-NEXT: buffer_wbl2 sc1 5004; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 5005; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5006; GFX940-NEXT: buffer_inv sc1 5007; GFX940-NEXT: s_setpc_b64 s[30:31] 5008; 5009; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5010; GFX11: ; %bb.0: 5011; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5012; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5013; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc 5014; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5015; GFX11-NEXT: buffer_gl1_inv 5016; GFX11-NEXT: buffer_gl0_inv 5017; GFX11-NEXT: s_setpc_b64 s[30:31] 5018; 5019; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5020; GFX10: ; %bb.0: 5021; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5022; GFX10-NEXT: flat_load_dword v3, v[0:1] 5023; GFX10-NEXT: s_mov_b32 s4, 0 5024; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start 5025; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5026; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5027; GFX10-NEXT: v_mov_b32_e32 v4, v3 5028; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 5029; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5030; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5031; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5032; GFX10-NEXT: buffer_gl1_inv 5033; GFX10-NEXT: buffer_gl0_inv 5034; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 5035; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5036; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5037; GFX10-NEXT: s_cbranch_execnz .LBB26_1 5038; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5039; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5040; GFX10-NEXT: v_mov_b32_e32 v0, v3 5041; GFX10-NEXT: s_setpc_b64 s[30:31] 5042; 5043; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5044; GFX90A: ; %bb.0: 5045; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5046; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 5047; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5048; GFX90A-NEXT: ; implicit-def: $vgpr3 5049; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5050; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5051; GFX90A-NEXT: s_cbranch_execz .LBB26_6 5052; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private 5053; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 5054; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 5055; GFX90A-NEXT: ; implicit-def: $vgpr3 5056; GFX90A-NEXT: 
s_and_saveexec_b64 s[6:7], vcc 5057; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 5058; GFX90A-NEXT: s_cbranch_execz .LBB26_3 5059; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global 5060; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc 5061; GFX90A-NEXT: s_waitcnt vmcnt(0) 5062; GFX90A-NEXT: buffer_wbinvl1 5063; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5064; GFX90A-NEXT: ; implicit-def: $vgpr2 5065; GFX90A-NEXT: .LBB26_3: ; %Flow 5066; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 5067; GFX90A-NEXT: s_cbranch_execz .LBB26_5 5068; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private 5069; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5070; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 5071; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen 5072; GFX90A-NEXT: s_waitcnt vmcnt(0) 5073; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 5074; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 5075; GFX90A-NEXT: .LBB26_5: ; %Flow1 5076; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5077; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5078; GFX90A-NEXT: ; implicit-def: $vgpr2 5079; GFX90A-NEXT: .LBB26_6: ; %Flow2 5080; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5081; GFX90A-NEXT: s_cbranch_execz .LBB26_8 5082; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared 5083; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5084; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 5085; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 5086; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5087; GFX90A-NEXT: .LBB26_8: ; %atomicrmw.phi 5088; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5089; GFX90A-NEXT: v_mov_b32_e32 v0, v3 5090; GFX90A-NEXT: s_waitcnt vmcnt(0) 5091; GFX90A-NEXT: s_setpc_b64 s[30:31] 5092; 5093; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5094; GFX908: ; %bb.0: 5095; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5096; GFX908-NEXT: flat_load_dword v3, v[0:1] 5097; GFX908-NEXT: s_mov_b64 s[4:5], 0 5098; GFX908-NEXT: .LBB26_1: ; 
%atomicrmw.start 5099; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5100; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5101; GFX908-NEXT: v_mov_b32_e32 v4, v3 5102; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 5103; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5104; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5105; GFX908-NEXT: buffer_wbinvl1 5106; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5107; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5108; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5109; GFX908-NEXT: s_cbranch_execnz .LBB26_1 5110; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5111; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5112; GFX908-NEXT: v_mov_b32_e32 v0, v3 5113; GFX908-NEXT: s_setpc_b64 s[30:31] 5114; 5115; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5116; GFX8: ; %bb.0: 5117; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5118; GFX8-NEXT: flat_load_dword v3, v[0:1] 5119; GFX8-NEXT: s_mov_b64 s[4:5], 0 5120; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start 5121; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5122; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5123; GFX8-NEXT: v_mov_b32_e32 v4, v3 5124; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 5125; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5126; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5127; GFX8-NEXT: buffer_wbinvl1 5128; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5129; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5130; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5131; GFX8-NEXT: s_cbranch_execnz .LBB26_1 5132; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5133; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5134; GFX8-NEXT: v_mov_b32_e32 v0, v3 5135; GFX8-NEXT: s_setpc_b64 s[30:31] 5136; 5137; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5138; GFX7: ; %bb.0: 5139; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5140; GFX7-NEXT: flat_load_dword v3, v[0:1] 5141; 
GFX7-NEXT: s_mov_b64 s[4:5], 0 5142; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start 5143; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5144; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5145; GFX7-NEXT: v_mov_b32_e32 v4, v3 5146; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 5147; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5148; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5149; GFX7-NEXT: buffer_wbinvl1 5150; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5151; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5152; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5153; GFX7-NEXT: s_cbranch_execnz .LBB26_1 5154; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5155; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5156; GFX7-NEXT: v_mov_b32_e32 v0, v3 5157; GFX7-NEXT: s_setpc_b64 s[30:31] 5158 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 5159 ret float %result 5160} 5161 5162define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 { 5163; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5164; GFX12: ; %bb.0: 5165; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5166; GFX12-NEXT: s_wait_expcnt 0x0 5167; GFX12-NEXT: s_wait_samplecnt 0x0 5168; GFX12-NEXT: s_wait_bvhcnt 0x0 5169; GFX12-NEXT: s_wait_kmcnt 0x0 5170; GFX12-NEXT: s_wait_storecnt 0x0 5171; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV 5172; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 5173; GFX12-NEXT: global_inv scope:SCOPE_DEV 5174; GFX12-NEXT: s_setpc_b64 s[30:31] 5175; 5176; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5177; GFX940: ; %bb.0: 5178; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5179; GFX940-NEXT: buffer_wbl2 sc1 5180; GFX940-NEXT: flat_atomic_add_f32 v[0:1], 
v2 5181; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5182; GFX940-NEXT: buffer_inv sc1 5183; GFX940-NEXT: s_setpc_b64 s[30:31] 5184; 5185; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5186; GFX11: ; %bb.0: 5187; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5188; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5189; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 5190; GFX11-NEXT: s_waitcnt lgkmcnt(0) 5191; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5192; GFX11-NEXT: buffer_gl1_inv 5193; GFX11-NEXT: buffer_gl0_inv 5194; GFX11-NEXT: s_setpc_b64 s[30:31] 5195; 5196; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5197; GFX10: ; %bb.0: 5198; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5199; GFX10-NEXT: flat_load_dword v4, v[0:1] 5200; GFX10-NEXT: s_mov_b32 s4, 0 5201; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start 5202; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5203; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5204; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 5205; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5206; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5207; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5208; GFX10-NEXT: buffer_gl1_inv 5209; GFX10-NEXT: buffer_gl0_inv 5210; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 5211; GFX10-NEXT: v_mov_b32_e32 v4, v3 5212; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5213; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5214; GFX10-NEXT: s_cbranch_execnz .LBB27_1 5215; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5216; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5217; GFX10-NEXT: s_setpc_b64 s[30:31] 5218; 5219; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5220; GFX90A: ; %bb.0: 5221; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5222; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 5223; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, s5, v1 5224; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5225; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5226; GFX90A-NEXT: s_cbranch_execnz .LBB27_3 5227; GFX90A-NEXT: ; %bb.1: ; %Flow2 5228; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5229; GFX90A-NEXT: s_cbranch_execnz .LBB27_8 5230; GFX90A-NEXT: .LBB27_2: ; %atomicrmw.phi 5231; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5232; GFX90A-NEXT: s_waitcnt vmcnt(0) 5233; GFX90A-NEXT: s_setpc_b64 s[30:31] 5234; GFX90A-NEXT: .LBB27_3: ; %atomicrmw.check.private 5235; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base 5236; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 5237; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc 5238; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 5239; GFX90A-NEXT: s_cbranch_execz .LBB27_5 5240; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global 5241; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off 5242; GFX90A-NEXT: s_waitcnt vmcnt(0) 5243; GFX90A-NEXT: buffer_wbinvl1 5244; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5245; GFX90A-NEXT: ; implicit-def: $vgpr2 5246; GFX90A-NEXT: .LBB27_5: ; %Flow 5247; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 5248; GFX90A-NEXT: s_cbranch_execz .LBB27_7 5249; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private 5250; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5251; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 5252; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 5253; GFX90A-NEXT: s_waitcnt vmcnt(0) 5254; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 5255; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 5256; GFX90A-NEXT: .LBB27_7: ; %Flow1 5257; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5258; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5259; GFX90A-NEXT: ; implicit-def: $vgpr2 5260; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5261; GFX90A-NEXT: s_cbranch_execz .LBB27_2 5262; GFX90A-NEXT: .LBB27_8: ; %atomicrmw.shared 5263; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5264; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 5265; GFX90A-NEXT: 
ds_add_f32 v0, v2 5266; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5267; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5268; GFX90A-NEXT: s_waitcnt vmcnt(0) 5269; GFX90A-NEXT: s_setpc_b64 s[30:31] 5270; 5271; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5272; GFX908: ; %bb.0: 5273; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5274; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base 5275; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5276; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5277; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5278; GFX908-NEXT: s_cbranch_execnz .LBB27_3 5279; GFX908-NEXT: ; %bb.1: ; %Flow2 5280; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5281; GFX908-NEXT: s_cbranch_execnz .LBB27_8 5282; GFX908-NEXT: .LBB27_2: ; %atomicrmw.phi 5283; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5284; GFX908-NEXT: s_waitcnt vmcnt(0) 5285; GFX908-NEXT: s_setpc_b64 s[30:31] 5286; GFX908-NEXT: .LBB27_3: ; %atomicrmw.check.private 5287; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base 5288; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 5289; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc 5290; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] 5291; GFX908-NEXT: s_cbranch_execz .LBB27_5 5292; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global 5293; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off 5294; GFX908-NEXT: s_waitcnt vmcnt(0) 5295; GFX908-NEXT: buffer_wbinvl1 5296; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5297; GFX908-NEXT: ; implicit-def: $vgpr2 5298; GFX908-NEXT: .LBB27_5: ; %Flow 5299; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] 5300; GFX908-NEXT: s_cbranch_execz .LBB27_7 5301; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private 5302; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5303; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 5304; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 5305; GFX908-NEXT: s_waitcnt vmcnt(0) 5306; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 5307; GFX908-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen 5308; GFX908-NEXT: .LBB27_7: ; %Flow1 5309; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5310; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5311; GFX908-NEXT: ; implicit-def: $vgpr2 5312; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5313; GFX908-NEXT: s_cbranch_execz .LBB27_2 5314; GFX908-NEXT: .LBB27_8: ; %atomicrmw.shared 5315; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5316; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 5317; GFX908-NEXT: ds_add_f32 v0, v2 5318; GFX908-NEXT: s_waitcnt lgkmcnt(0) 5319; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5320; GFX908-NEXT: s_waitcnt vmcnt(0) 5321; GFX908-NEXT: s_setpc_b64 s[30:31] 5322; 5323; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5324; GFX8: ; %bb.0: 5325; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5326; GFX8-NEXT: flat_load_dword v4, v[0:1] 5327; GFX8-NEXT: s_mov_b64 s[4:5], 0 5328; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start 5329; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5330; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5331; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 5332; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5333; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5334; GFX8-NEXT: buffer_wbinvl1 5335; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5336; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5337; GFX8-NEXT: v_mov_b32_e32 v4, v3 5338; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5339; GFX8-NEXT: s_cbranch_execnz .LBB27_1 5340; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5341; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5342; GFX8-NEXT: s_setpc_b64 s[30:31] 5343; 5344; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 5345; GFX7: ; %bb.0: 5346; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5347; GFX7-NEXT: flat_load_dword v4, v[0:1] 5348; GFX7-NEXT: s_mov_b64 s[4:5], 0 5349; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start 5350; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5351; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5352; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 5353; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5354; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5355; GFX7-NEXT: buffer_wbinvl1 5356; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5357; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5358; GFX7-NEXT: v_mov_b32_e32 v4, v3 5359; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5360; GFX7-NEXT: s_cbranch_execnz .LBB27_1 5361; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5362; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5363; GFX7-NEXT: s_setpc_b64 s[30:31] 5364 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 5365 ret void 5366} 5367 5368define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory(ptr %ptr, float %val) #0 { 5369; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5370; GFX12: ; %bb.0: 5371; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5372; GFX12-NEXT: s_wait_expcnt 0x0 5373; GFX12-NEXT: s_wait_samplecnt 0x0 5374; GFX12-NEXT: s_wait_bvhcnt 0x0 5375; GFX12-NEXT: s_wait_kmcnt 0x0 5376; GFX12-NEXT: s_wait_storecnt 0x0 5377; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5378; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5379; GFX12-NEXT: global_inv scope:SCOPE_DEV 5380; GFX12-NEXT: s_setpc_b64 s[30:31] 5381; 5382; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5383; GFX940: ; %bb.0: 5384; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5385; GFX940-NEXT: buffer_wbl2 sc1 5386; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 5387; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5388; GFX940-NEXT: buffer_inv sc1 5389; GFX940-NEXT: s_setpc_b64 s[30:31] 5390; 5391; GFX11-LABEL: 
flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5392; GFX11: ; %bb.0: 5393; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5394; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5395; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc 5396; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5397; GFX11-NEXT: buffer_gl1_inv 5398; GFX11-NEXT: buffer_gl0_inv 5399; GFX11-NEXT: s_setpc_b64 s[30:31] 5400; 5401; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5402; GFX10: ; %bb.0: 5403; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5404; GFX10-NEXT: flat_load_dword v3, v[0:1] 5405; GFX10-NEXT: s_mov_b32 s4, 0 5406; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start 5407; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5408; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5409; GFX10-NEXT: v_mov_b32_e32 v4, v3 5410; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 5411; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5412; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5413; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5414; GFX10-NEXT: buffer_gl1_inv 5415; GFX10-NEXT: buffer_gl0_inv 5416; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 5417; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5418; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5419; GFX10-NEXT: s_cbranch_execnz .LBB28_1 5420; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5421; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5422; GFX10-NEXT: v_mov_b32_e32 v0, v3 5423; GFX10-NEXT: s_setpc_b64 s[30:31] 5424; 5425; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5426; GFX90A: ; %bb.0: 5427; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5428; GFX90A-NEXT: flat_load_dword v3, v[0:1] 5429; GFX90A-NEXT: s_mov_b64 s[4:5], 0 5430; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start 5431; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5432; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5433; GFX90A-NEXT: v_mov_b32_e32 v5, v3 5434; GFX90A-NEXT: 
v_add_f32_e32 v4, v5, v2 5435; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 5436; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5437; GFX90A-NEXT: buffer_wbinvl1 5438; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 5439; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5440; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 5441; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 5442; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 5443; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5444; GFX90A-NEXT: v_mov_b32_e32 v0, v3 5445; GFX90A-NEXT: s_setpc_b64 s[30:31] 5446; 5447; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5448; GFX908: ; %bb.0: 5449; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5450; GFX908-NEXT: flat_load_dword v3, v[0:1] 5451; GFX908-NEXT: s_mov_b64 s[4:5], 0 5452; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start 5453; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5454; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5455; GFX908-NEXT: v_mov_b32_e32 v4, v3 5456; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 5457; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5458; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5459; GFX908-NEXT: buffer_wbinvl1 5460; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5461; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5462; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5463; GFX908-NEXT: s_cbranch_execnz .LBB28_1 5464; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5465; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5466; GFX908-NEXT: v_mov_b32_e32 v0, v3 5467; GFX908-NEXT: s_setpc_b64 s[30:31] 5468; 5469; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5470; GFX8: ; %bb.0: 5471; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5472; GFX8-NEXT: flat_load_dword v3, v[0:1] 5473; GFX8-NEXT: s_mov_b64 s[4:5], 0 5474; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start 5475; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5476; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5477; GFX8-NEXT: 
v_mov_b32_e32 v4, v3 5478; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 5479; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5480; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5481; GFX8-NEXT: buffer_wbinvl1 5482; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5483; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5484; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5485; GFX8-NEXT: s_cbranch_execnz .LBB28_1 5486; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5487; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5488; GFX8-NEXT: v_mov_b32_e32 v0, v3 5489; GFX8-NEXT: s_setpc_b64 s[30:31] 5490; 5491; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5492; GFX7: ; %bb.0: 5493; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5494; GFX7-NEXT: flat_load_dword v3, v[0:1] 5495; GFX7-NEXT: s_mov_b64 s[4:5], 0 5496; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start 5497; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5498; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5499; GFX7-NEXT: v_mov_b32_e32 v4, v3 5500; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 5501; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5502; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5503; GFX7-NEXT: buffer_wbinvl1 5504; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5505; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5506; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5507; GFX7-NEXT: s_cbranch_execnz .LBB28_1 5508; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5509; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5510; GFX7-NEXT: v_mov_b32_e32 v0, v3 5511; GFX7-NEXT: s_setpc_b64 s[30:31] 5512 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 5513 ret float %result 5514} 5515 5516define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory(ptr %ptr, float %val) #0 { 5517; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5518; GFX12: ; %bb.0: 5519; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 5520; GFX12-NEXT: s_wait_expcnt 0x0 5521; GFX12-NEXT: s_wait_samplecnt 0x0 5522; GFX12-NEXT: s_wait_bvhcnt 0x0 5523; GFX12-NEXT: s_wait_kmcnt 0x0 5524; GFX12-NEXT: s_wait_storecnt 0x0 5525; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV 5526; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 5527; GFX12-NEXT: global_inv scope:SCOPE_DEV 5528; GFX12-NEXT: s_setpc_b64 s[30:31] 5529; 5530; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5531; GFX940: ; %bb.0: 5532; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5533; GFX940-NEXT: buffer_wbl2 sc1 5534; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 5535; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5536; GFX940-NEXT: buffer_inv sc1 5537; GFX940-NEXT: s_setpc_b64 s[30:31] 5538; 5539; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5540; GFX11: ; %bb.0: 5541; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5542; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5543; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 5544; GFX11-NEXT: s_waitcnt lgkmcnt(0) 5545; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5546; GFX11-NEXT: buffer_gl1_inv 5547; GFX11-NEXT: buffer_gl0_inv 5548; GFX11-NEXT: s_setpc_b64 s[30:31] 5549; 5550; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5551; GFX10: ; %bb.0: 5552; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5553; GFX10-NEXT: flat_load_dword v4, v[0:1] 5554; GFX10-NEXT: s_mov_b32 s4, 0 5555; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start 5556; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5557; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5558; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 5559; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5560; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5561; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5562; GFX10-NEXT: buffer_gl1_inv 5563; GFX10-NEXT: buffer_gl0_inv 5564; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 5565; GFX10-NEXT: v_mov_b32_e32 v4, v3 5566; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5567; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5568; GFX10-NEXT: s_cbranch_execnz .LBB29_1 5569; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5570; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5571; GFX10-NEXT: s_setpc_b64 s[30:31] 5572; 5573; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5574; GFX90A: ; %bb.0: 5575; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5576; GFX90A-NEXT: flat_load_dword v5, v[0:1] 5577; GFX90A-NEXT: s_mov_b64 s[4:5], 0 5578; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start 5579; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5580; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5581; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 5582; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 5583; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5584; GFX90A-NEXT: buffer_wbinvl1 5585; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 5586; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5587; GFX90A-NEXT: v_mov_b32_e32 v5, v3 5588; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 5589; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 5590; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 5591; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5592; GFX90A-NEXT: s_setpc_b64 s[30:31] 5593; 5594; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5595; GFX908: ; %bb.0: 5596; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5597; GFX908-NEXT: flat_load_dword v4, v[0:1] 5598; GFX908-NEXT: s_mov_b64 s[4:5], 0 5599; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start 5600; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5601; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5602; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 5603; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5604; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5605; GFX908-NEXT: buffer_wbinvl1 5606; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 
5607; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5608; GFX908-NEXT: v_mov_b32_e32 v4, v3 5609; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5610; GFX908-NEXT: s_cbranch_execnz .LBB29_1 5611; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5612; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5613; GFX908-NEXT: s_setpc_b64 s[30:31] 5614; 5615; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5616; GFX8: ; %bb.0: 5617; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5618; GFX8-NEXT: flat_load_dword v4, v[0:1] 5619; GFX8-NEXT: s_mov_b64 s[4:5], 0 5620; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start 5621; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5622; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5623; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 5624; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5625; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5626; GFX8-NEXT: buffer_wbinvl1 5627; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5628; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5629; GFX8-NEXT: v_mov_b32_e32 v4, v3 5630; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5631; GFX8-NEXT: s_cbranch_execnz .LBB29_1 5632; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5633; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5634; GFX8-NEXT: s_setpc_b64 s[30:31] 5635; 5636; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: 5637; GFX7: ; %bb.0: 5638; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5639; GFX7-NEXT: flat_load_dword v4, v[0:1] 5640; GFX7-NEXT: s_mov_b64 s[4:5], 0 5641; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start 5642; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5643; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5644; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 5645; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 5646; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5647; GFX7-NEXT: buffer_wbinvl1 5648; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 5649; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5650; GFX7-NEXT: v_mov_b32_e32 v4, v3 5651; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5652; GFX7-NEXT: s_cbranch_execnz .LBB29_1 5653; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5654; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5655; GFX7-NEXT: s_setpc_b64 s[30:31] 5656 %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 5657 ret void 5658} 5659 5660; -------------------------------------------------------------------- 5661; double 5662; -------------------------------------------------------------------- 5663 5664define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 5665; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5666; GFX12: ; %bb.0: 5667; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5668; GFX12-NEXT: s_wait_expcnt 0x0 5669; GFX12-NEXT: s_wait_samplecnt 0x0 5670; GFX12-NEXT: s_wait_bvhcnt 0x0 5671; GFX12-NEXT: s_wait_kmcnt 0x0 5672; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 5673; GFX12-NEXT: s_mov_b32 s0, exec_lo 5674; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 5675; GFX12-NEXT: s_wait_alu 0xfffe 5676; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 5677; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 5678; GFX12-NEXT: s_cbranch_execz .LBB30_4 5679; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global 5680; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] 5681; GFX12-NEXT: s_mov_b32 s1, 0 5682; GFX12-NEXT: .LBB30_2: ; %atomicrmw.start 5683; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5684; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5685; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 5686; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5687; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] 5688; GFX12-NEXT: s_wait_storecnt 0x0 5689; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5690; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5691; GFX12-NEXT: global_inv scope:SCOPE_DEV 5692; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5693; GFX12-NEXT: 
s_wait_alu 0xfffe 5694; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 5695; GFX12-NEXT: s_wait_alu 0xfffe 5696; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5697; GFX12-NEXT: s_cbranch_execnz .LBB30_2 5698; GFX12-NEXT: ; %bb.3: ; %Flow 5699; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 5700; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 5701; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 5702; GFX12-NEXT: .LBB30_4: ; %Flow3 5703; GFX12-NEXT: s_wait_alu 0xfffe 5704; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 5705; GFX12-NEXT: s_cbranch_execz .LBB30_6 5706; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private 5707; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5708; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5709; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off 5710; GFX12-NEXT: s_wait_loadcnt 0x0 5711; GFX12-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] 5712; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off 5713; GFX12-NEXT: .LBB30_6: ; %atomicrmw.phi 5714; GFX12-NEXT: s_wait_alu 0xfffe 5715; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5716; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 5717; GFX12-NEXT: s_wait_alu 0xfffe 5718; GFX12-NEXT: s_setpc_b64 s[30:31] 5719; 5720; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5721; GFX940: ; %bb.0: 5722; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5723; GFX940-NEXT: v_mov_b32_e32 v5, v1 5724; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 5725; GFX940-NEXT: v_mov_b32_e32 v4, v0 5726; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 5727; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 5728; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 5729; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 5730; GFX940-NEXT: s_cbranch_execnz .LBB30_3 5731; GFX940-NEXT: ; %bb.1: ; %Flow 5732; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5733; GFX940-NEXT: s_cbranch_execnz .LBB30_4 5734; GFX940-NEXT: .LBB30_2: ; %atomicrmw.phi 5735; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5736; GFX940-NEXT: s_setpc_b64 s[30:31] 5737; GFX940-NEXT: .LBB30_3: 
; %atomicrmw.global 5738; GFX940-NEXT: buffer_wbl2 sc1 5739; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 5740; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5741; GFX940-NEXT: buffer_inv sc1 5742; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 5743; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 5744; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5745; GFX940-NEXT: s_cbranch_execz .LBB30_2 5746; GFX940-NEXT: .LBB30_4: ; %atomicrmw.private 5747; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 5748; GFX940-NEXT: s_nop 1 5749; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 5750; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 5751; GFX940-NEXT: s_waitcnt vmcnt(0) 5752; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 5753; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 5754; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5755; GFX940-NEXT: s_waitcnt vmcnt(0) 5756; GFX940-NEXT: s_setpc_b64 s[30:31] 5757; 5758; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5759; GFX11: ; %bb.0: 5760; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5761; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 5762; GFX11-NEXT: s_mov_b32 s0, exec_lo 5763; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 5764; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 5765; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 5766; GFX11-NEXT: s_cbranch_execz .LBB30_4 5767; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global 5768; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] 5769; GFX11-NEXT: s_mov_b32 s1, 0 5770; GFX11-NEXT: .LBB30_2: ; %atomicrmw.start 5771; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5772; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5773; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 5774; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5775; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 5776; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5777; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 5778; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5779; GFX11-NEXT: buffer_gl1_inv 5780; 
GFX11-NEXT: buffer_gl0_inv 5781; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5782; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 5783; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5784; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5785; GFX11-NEXT: s_cbranch_execnz .LBB30_2 5786; GFX11-NEXT: ; %bb.3: ; %Flow 5787; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 5788; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 5789; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 5790; GFX11-NEXT: .LBB30_4: ; %Flow3 5791; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5792; GFX11-NEXT: s_cbranch_execz .LBB30_6 5793; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private 5794; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5795; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5796; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off 5797; GFX11-NEXT: s_waitcnt vmcnt(0) 5798; GFX11-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] 5799; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off 5800; GFX11-NEXT: .LBB30_6: ; %atomicrmw.phi 5801; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5802; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 5803; GFX11-NEXT: s_setpc_b64 s[30:31] 5804; 5805; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5806; GFX10: ; %bb.0: 5807; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5808; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 5809; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 5810; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 5811; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 5812; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 5813; GFX10-NEXT: s_cbranch_execz .LBB30_4 5814; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global 5815; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 5816; GFX10-NEXT: s_mov_b32 s5, 0 5817; GFX10-NEXT: .LBB30_2: ; %atomicrmw.start 5818; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5819; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5820; GFX10-NEXT: v_mov_b32_e32 v7, v5 5821; GFX10-NEXT: v_mov_b32_e32 v6, v4 5822; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 5823; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5824; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5825; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5826; GFX10-NEXT: buffer_gl1_inv 5827; GFX10-NEXT: buffer_gl0_inv 5828; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5829; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5830; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5831; GFX10-NEXT: s_cbranch_execnz .LBB30_2 5832; GFX10-NEXT: ; %bb.3: ; %Flow 5833; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5834; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 5835; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 5836; GFX10-NEXT: .LBB30_4: ; %Flow3 5837; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5838; GFX10-NEXT: s_cbranch_execz .LBB30_6 5839; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private 5840; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5841; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5842; GFX10-NEXT: s_clause 0x1 5843; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 5844; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 5845; GFX10-NEXT: s_waitcnt vmcnt(0) 5846; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] 5847; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5848; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 5849; GFX10-NEXT: .LBB30_6: ; %atomicrmw.phi 5850; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5851; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5852; GFX10-NEXT: v_mov_b32_e32 v0, v4 5853; GFX10-NEXT: v_mov_b32_e32 v1, v5 5854; GFX10-NEXT: s_setpc_b64 s[30:31] 5855; 5856; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5857; GFX90A: ; %bb.0: 5858; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5859; GFX90A-NEXT: v_mov_b32_e32 v5, v1 5860; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 5861; GFX90A-NEXT: v_mov_b32_e32 v4, v0 5862; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 5863; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5864; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5865; GFX90A-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] 5866; GFX90A-NEXT: s_cbranch_execnz .LBB30_3 5867; GFX90A-NEXT: ; %bb.1: ; %Flow 5868; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5869; GFX90A-NEXT: s_cbranch_execnz .LBB30_4 5870; GFX90A-NEXT: .LBB30_2: ; %atomicrmw.phi 5871; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5872; GFX90A-NEXT: s_setpc_b64 s[30:31] 5873; GFX90A-NEXT: .LBB30_3: ; %atomicrmw.global 5874; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc 5875; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5876; GFX90A-NEXT: buffer_wbinvl1 5877; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 5878; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 5879; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5880; GFX90A-NEXT: s_cbranch_execz .LBB30_2 5881; GFX90A-NEXT: .LBB30_4: ; %atomicrmw.private 5882; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 5883; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 5884; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5885; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5886; GFX90A-NEXT: s_waitcnt vmcnt(0) 5887; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 5888; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 5889; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 5890; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5891; GFX90A-NEXT: s_waitcnt vmcnt(0) 5892; GFX90A-NEXT: s_setpc_b64 s[30:31] 5893; 5894; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5895; GFX908: ; %bb.0: 5896; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5897; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 5898; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5899; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 5900; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5901; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5902; GFX908-NEXT: s_cbranch_execz .LBB30_4 5903; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global 5904; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 5905; GFX908-NEXT: s_mov_b64 s[6:7], 0 5906; GFX908-NEXT: .LBB30_2: ; %atomicrmw.start 
5907; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5908; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5909; GFX908-NEXT: v_mov_b32_e32 v7, v5 5910; GFX908-NEXT: v_mov_b32_e32 v6, v4 5911; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 5912; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5913; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5914; GFX908-NEXT: buffer_wbinvl1 5915; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5916; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5917; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 5918; GFX908-NEXT: s_cbranch_execnz .LBB30_2 5919; GFX908-NEXT: ; %bb.3: ; %Flow 5920; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5921; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5922; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 5923; GFX908-NEXT: .LBB30_4: ; %Flow3 5924; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5925; GFX908-NEXT: s_cbranch_execz .LBB30_6 5926; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private 5927; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5928; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5929; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 5930; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 5931; GFX908-NEXT: s_waitcnt vmcnt(0) 5932; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] 5933; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5934; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 5935; GFX908-NEXT: .LBB30_6: ; %atomicrmw.phi 5936; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5937; GFX908-NEXT: v_mov_b32_e32 v0, v4 5938; GFX908-NEXT: v_mov_b32_e32 v1, v5 5939; GFX908-NEXT: s_waitcnt vmcnt(0) 5940; GFX908-NEXT: s_setpc_b64 s[30:31] 5941; 5942; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5943; GFX8: ; %bb.0: 5944; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5945; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 5946; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 5947; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 5948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
5949; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5950; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5951; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5952; GFX8-NEXT: s_cbranch_execz .LBB30_4 5953; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global 5954; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 5955; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5956; GFX8-NEXT: flat_load_dword v5, v[4:5] 5957; GFX8-NEXT: flat_load_dword v4, v[0:1] 5958; GFX8-NEXT: s_mov_b64 s[6:7], 0 5959; GFX8-NEXT: .LBB30_2: ; %atomicrmw.start 5960; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5961; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5962; GFX8-NEXT: v_mov_b32_e32 v7, v5 5963; GFX8-NEXT: v_mov_b32_e32 v6, v4 5964; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 5965; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5966; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5967; GFX8-NEXT: buffer_wbinvl1 5968; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5969; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5970; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5971; GFX8-NEXT: s_cbranch_execnz .LBB30_2 5972; GFX8-NEXT: ; %bb.3: ; %Flow 5973; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5974; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5975; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 5976; GFX8-NEXT: .LBB30_4: ; %Flow3 5977; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5978; GFX8-NEXT: s_cbranch_execz .LBB30_6 5979; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private 5980; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5981; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5982; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 5983; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 5984; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen 5985; GFX8-NEXT: s_waitcnt vmcnt(0) 5986; GFX8-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] 5987; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5988; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 5989; GFX8-NEXT: .LBB30_6: ; %atomicrmw.phi 5990; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5991; GFX8-NEXT: 
v_mov_b32_e32 v0, v4 5992; GFX8-NEXT: v_mov_b32_e32 v1, v5 5993; GFX8-NEXT: s_waitcnt vmcnt(0) 5994; GFX8-NEXT: s_setpc_b64 s[30:31] 5995; 5996; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: 5997; GFX7: ; %bb.0: 5998; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5999; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 6000; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 6001; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 6002; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6003; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 6004; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 6005; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6006; GFX7-NEXT: s_cbranch_execz .LBB30_4 6007; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global 6008; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 6009; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 6010; GFX7-NEXT: flat_load_dword v5, v[4:5] 6011; GFX7-NEXT: flat_load_dword v4, v[0:1] 6012; GFX7-NEXT: s_mov_b64 s[6:7], 0 6013; GFX7-NEXT: .LBB30_2: ; %atomicrmw.start 6014; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6015; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6016; GFX7-NEXT: v_mov_b32_e32 v7, v5 6017; GFX7-NEXT: v_mov_b32_e32 v6, v4 6018; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 6019; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 6020; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6021; GFX7-NEXT: buffer_wbinvl1 6022; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 6023; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6024; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 6025; GFX7-NEXT: s_cbranch_execnz .LBB30_2 6026; GFX7-NEXT: ; %bb.3: ; %Flow 6027; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 6028; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 6029; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 6030; GFX7-NEXT: .LBB30_4: ; %Flow3 6031; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6032; GFX7-NEXT: s_cbranch_execz .LBB30_6 6033; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private 6034; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 6035; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 6036; GFX7-NEXT: 
v_add_i32_e32 v7, vcc, 4, v6 6037; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 6038; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen 6039; GFX7-NEXT: s_waitcnt vmcnt(0) 6040; GFX7-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] 6041; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 6042; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 6043; GFX7-NEXT: .LBB30_6: ; %atomicrmw.phi 6044; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6045; GFX7-NEXT: v_mov_b32_e32 v0, v4 6046; GFX7-NEXT: v_mov_b32_e32 v1, v5 6047; GFX7-NEXT: s_waitcnt vmcnt(0) 6048; GFX7-NEXT: s_setpc_b64 s[30:31] 6049 %result = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6050 ret double %result 6051} 6052 6053define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 6054; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6055; GFX12: ; %bb.0: 6056; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6057; GFX12-NEXT: s_wait_expcnt 0x0 6058; GFX12-NEXT: s_wait_samplecnt 0x0 6059; GFX12-NEXT: s_wait_bvhcnt 0x0 6060; GFX12-NEXT: s_wait_kmcnt 0x0 6061; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 6062; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 6063; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 6064; GFX12-NEXT: s_mov_b32 s0, exec_lo 6065; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 6066; GFX12-NEXT: s_wait_alu 0xfffe 6067; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6068; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 6069; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 6070; GFX12-NEXT: s_cbranch_execnz .LBB31_3 6071; GFX12-NEXT: ; %bb.1: ; %Flow3 6072; GFX12-NEXT: s_wait_alu 0xfffe 6073; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 6074; GFX12-NEXT: s_cbranch_execnz .LBB31_6 6075; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi 6076; GFX12-NEXT: s_wait_alu 0xfffe 6077; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6078; GFX12-NEXT: s_wait_alu 0xfffe 6079; GFX12-NEXT: 
s_setpc_b64 s[30:31] 6080; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global 6081; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] 6082; GFX12-NEXT: s_mov_b32 s1, 0 6083; GFX12-NEXT: .LBB31_4: ; %atomicrmw.start 6084; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6085; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6086; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 6087; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6088; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[2:3] 6089; GFX12-NEXT: s_wait_storecnt 0x0 6090; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6091; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6092; GFX12-NEXT: global_inv scope:SCOPE_DEV 6093; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 6094; GFX12-NEXT: s_wait_alu 0xfffe 6095; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 6096; GFX12-NEXT: s_wait_alu 0xfffe 6097; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6098; GFX12-NEXT: s_cbranch_execnz .LBB31_4 6099; GFX12-NEXT: ; %bb.5: ; %Flow 6100; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 6101; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 6102; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 6103; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 6104; GFX12-NEXT: s_cbranch_execz .LBB31_2 6105; GFX12-NEXT: .LBB31_6: ; %atomicrmw.private 6106; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 6107; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 6108; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 6109; GFX12-NEXT: s_wait_loadcnt 0x0 6110; GFX12-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] 6111; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off 6112; GFX12-NEXT: s_wait_alu 0xfffe 6113; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6114; GFX12-NEXT: s_wait_alu 0xfffe 6115; GFX12-NEXT: s_setpc_b64 s[30:31] 6116; 6117; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6118; GFX940: ; %bb.0: 6119; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6120; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 6121; GFX940-NEXT: 
v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 6122; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 6123; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 6124; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 6125; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 6126; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6127; GFX940-NEXT: s_cbranch_execnz .LBB31_3 6128; GFX940-NEXT: ; %bb.1: ; %Flow 6129; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 6130; GFX940-NEXT: s_cbranch_execnz .LBB31_4 6131; GFX940-NEXT: .LBB31_2: ; %atomicrmw.phi 6132; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6133; GFX940-NEXT: s_setpc_b64 s[30:31] 6134; GFX940-NEXT: .LBB31_3: ; %atomicrmw.global 6135; GFX940-NEXT: buffer_wbl2 sc1 6136; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 6137; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6138; GFX940-NEXT: buffer_inv sc1 6139; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 6140; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 6141; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 6142; GFX940-NEXT: s_cbranch_execz .LBB31_2 6143; GFX940-NEXT: .LBB31_4: ; %atomicrmw.private 6144; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6145; GFX940-NEXT: s_nop 1 6146; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6147; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 6148; GFX940-NEXT: s_waitcnt vmcnt(0) 6149; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6150; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 6151; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6152; GFX940-NEXT: s_waitcnt vmcnt(0) 6153; GFX940-NEXT: s_setpc_b64 s[30:31] 6154; 6155; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6156; GFX11: ; %bb.0: 6157; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6158; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 6159; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 6160; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 6161; GFX11-NEXT: s_mov_b32 s0, exec_lo 6162; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 6163; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6164; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 6165; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 6166; GFX11-NEXT: s_cbranch_execnz .LBB31_3 6167; GFX11-NEXT: ; %bb.1: ; %Flow3 6168; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 6169; GFX11-NEXT: s_cbranch_execnz .LBB31_6 6170; GFX11-NEXT: .LBB31_2: ; %atomicrmw.phi 6171; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6172; GFX11-NEXT: s_setpc_b64 s[30:31] 6173; GFX11-NEXT: .LBB31_3: ; %atomicrmw.global 6174; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] 6175; GFX11-NEXT: s_mov_b32 s1, 0 6176; GFX11-NEXT: .LBB31_4: ; %atomicrmw.start 6177; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6178; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6179; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 6180; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6181; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6182; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6183; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc 6184; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6185; GFX11-NEXT: buffer_gl1_inv 6186; GFX11-NEXT: buffer_gl0_inv 6187; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 6188; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 6189; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6190; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6191; GFX11-NEXT: s_cbranch_execnz .LBB31_4 6192; GFX11-NEXT: ; %bb.5: ; %Flow 6193; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 6194; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 6195; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 6196; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 6197; GFX11-NEXT: s_cbranch_execz .LBB31_2 6198; GFX11-NEXT: .LBB31_6: ; %atomicrmw.private 6199; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 6200; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 6201; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 6202; GFX11-NEXT: s_waitcnt vmcnt(0) 6203; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6204; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off 6205; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 6206; GFX11-NEXT: s_setpc_b64 s[30:31] 6207; 6208; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6209; GFX10: ; %bb.0: 6210; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6211; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 6212; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 6213; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 6214; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 6215; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 6216; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 6217; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 6218; GFX10-NEXT: s_cbranch_execnz .LBB31_3 6219; GFX10-NEXT: ; %bb.1: ; %Flow3 6220; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 6221; GFX10-NEXT: s_cbranch_execnz .LBB31_6 6222; GFX10-NEXT: .LBB31_2: ; %atomicrmw.phi 6223; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6224; GFX10-NEXT: s_setpc_b64 s[30:31] 6225; GFX10-NEXT: .LBB31_3: ; %atomicrmw.global 6226; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 6227; GFX10-NEXT: s_mov_b32 s5, 0 6228; GFX10-NEXT: .LBB31_4: ; %atomicrmw.start 6229; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6230; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6231; GFX10-NEXT: v_mov_b32_e32 v9, v1 6232; GFX10-NEXT: v_mov_b32_e32 v8, v0 6233; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6234; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6235; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6236; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6237; GFX10-NEXT: buffer_gl1_inv 6238; GFX10-NEXT: buffer_gl0_inv 6239; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 6240; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 6241; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 6242; GFX10-NEXT: s_cbranch_execnz .LBB31_4 6243; GFX10-NEXT: ; %bb.5: ; %Flow 6244; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 6245; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 6246; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 6247; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 6248; GFX10-NEXT: 
s_cbranch_execz .LBB31_2 6249; GFX10-NEXT: .LBB31_6: ; %atomicrmw.private 6250; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 6251; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 6252; GFX10-NEXT: s_clause 0x1 6253; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6254; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 6255; GFX10-NEXT: s_waitcnt vmcnt(0) 6256; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6257; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6258; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 6259; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6260; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6261; GFX10-NEXT: s_setpc_b64 s[30:31] 6262; 6263; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6264; GFX90A: ; %bb.0: 6265; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6266; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 6267; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6268; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 6269; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 6270; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 6271; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 6272; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6273; GFX90A-NEXT: s_cbranch_execnz .LBB31_3 6274; GFX90A-NEXT: ; %bb.1: ; %Flow 6275; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6276; GFX90A-NEXT: s_cbranch_execnz .LBB31_4 6277; GFX90A-NEXT: .LBB31_2: ; %atomicrmw.phi 6278; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6279; GFX90A-NEXT: s_setpc_b64 s[30:31] 6280; GFX90A-NEXT: .LBB31_3: ; %atomicrmw.global 6281; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc 6282; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6283; GFX90A-NEXT: buffer_wbinvl1 6284; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 6285; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 6286; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6287; GFX90A-NEXT: s_cbranch_execz .LBB31_2 6288; GFX90A-NEXT: .LBB31_4: ; %atomicrmw.private 
6289; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6290; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6291; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6292; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 6293; GFX90A-NEXT: s_waitcnt vmcnt(0) 6294; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6295; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6296; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 6297; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6298; GFX90A-NEXT: s_waitcnt vmcnt(0) 6299; GFX90A-NEXT: s_setpc_b64 s[30:31] 6300; 6301; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6302; GFX908: ; %bb.0: 6303; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6304; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 6305; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6306; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 6307; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 6308; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 6309; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 6310; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6311; GFX908-NEXT: s_cbranch_execnz .LBB31_3 6312; GFX908-NEXT: ; %bb.1: ; %Flow3 6313; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6314; GFX908-NEXT: s_cbranch_execnz .LBB31_6 6315; GFX908-NEXT: .LBB31_2: ; %atomicrmw.phi 6316; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6317; GFX908-NEXT: s_setpc_b64 s[30:31] 6318; GFX908-NEXT: .LBB31_3: ; %atomicrmw.global 6319; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 6320; GFX908-NEXT: s_mov_b64 s[6:7], 0 6321; GFX908-NEXT: .LBB31_4: ; %atomicrmw.start 6322; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6323; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6324; GFX908-NEXT: v_mov_b32_e32 v9, v1 6325; GFX908-NEXT: v_mov_b32_e32 v8, v0 6326; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6327; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6328; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6329; 
GFX908-NEXT: buffer_wbinvl1 6330; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6331; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6332; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 6333; GFX908-NEXT: s_cbranch_execnz .LBB31_4 6334; GFX908-NEXT: ; %bb.5: ; %Flow 6335; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 6336; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 6337; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 6338; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6339; GFX908-NEXT: s_cbranch_execz .LBB31_2 6340; GFX908-NEXT: .LBB31_6: ; %atomicrmw.private 6341; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6342; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6343; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6344; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 6345; GFX908-NEXT: s_waitcnt vmcnt(0) 6346; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6347; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6348; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 6349; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6350; GFX908-NEXT: s_waitcnt vmcnt(0) 6351; GFX908-NEXT: s_setpc_b64 s[30:31] 6352; 6353; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6354; GFX8: ; %bb.0: 6355; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6356; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 6357; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 6358; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 6359; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 6360; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6361; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 6362; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6363; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 6364; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6365; GFX8-NEXT: s_cbranch_execnz .LBB31_3 6366; GFX8-NEXT: ; %bb.1: ; %Flow3 6367; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6368; GFX8-NEXT: s_cbranch_execnz .LBB31_6 6369; GFX8-NEXT: .LBB31_2: ; %atomicrmw.phi 6370; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 
6371; GFX8-NEXT: s_setpc_b64 s[30:31] 6372; GFX8-NEXT: .LBB31_3: ; %atomicrmw.global 6373; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 6374; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 6375; GFX8-NEXT: flat_load_dword v1, v[0:1] 6376; GFX8-NEXT: flat_load_dword v0, v[4:5] 6377; GFX8-NEXT: s_mov_b64 s[6:7], 0 6378; GFX8-NEXT: .LBB31_4: ; %atomicrmw.start 6379; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6380; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6381; GFX8-NEXT: v_mov_b32_e32 v9, v1 6382; GFX8-NEXT: v_mov_b32_e32 v8, v0 6383; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6384; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6385; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6386; GFX8-NEXT: buffer_wbinvl1 6387; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6388; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6389; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 6390; GFX8-NEXT: s_cbranch_execnz .LBB31_4 6391; GFX8-NEXT: ; %bb.5: ; %Flow 6392; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 6393; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 6394; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 6395; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6396; GFX8-NEXT: s_cbranch_execz .LBB31_2 6397; GFX8-NEXT: .LBB31_6: ; %atomicrmw.private 6398; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6399; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6400; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 6401; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6402; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 6403; GFX8-NEXT: s_waitcnt vmcnt(0) 6404; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6405; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6406; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 6407; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6408; GFX8-NEXT: s_waitcnt vmcnt(0) 6409; GFX8-NEXT: s_setpc_b64 s[30:31] 6410; 6411; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 6412; GFX7: ; %bb.0: 6413; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
6414; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 6415; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 6416; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 6417; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 6418; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6419; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 6420; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 6421; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 6422; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6423; GFX7-NEXT: s_cbranch_execnz .LBB31_3 6424; GFX7-NEXT: ; %bb.1: ; %Flow3 6425; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6426; GFX7-NEXT: s_cbranch_execnz .LBB31_6 6427; GFX7-NEXT: .LBB31_2: ; %atomicrmw.phi 6428; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6429; GFX7-NEXT: s_setpc_b64 s[30:31] 6430; GFX7-NEXT: .LBB31_3: ; %atomicrmw.global 6431; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 6432; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 6433; GFX7-NEXT: flat_load_dword v1, v[0:1] 6434; GFX7-NEXT: flat_load_dword v0, v[4:5] 6435; GFX7-NEXT: s_mov_b64 s[6:7], 0 6436; GFX7-NEXT: .LBB31_4: ; %atomicrmw.start 6437; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6438; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6439; GFX7-NEXT: v_mov_b32_e32 v9, v1 6440; GFX7-NEXT: v_mov_b32_e32 v8, v0 6441; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6442; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6443; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6444; GFX7-NEXT: buffer_wbinvl1 6445; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6446; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6447; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 6448; GFX7-NEXT: s_cbranch_execnz .LBB31_4 6449; GFX7-NEXT: ; %bb.5: ; %Flow 6450; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 6451; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 6452; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 6453; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6454; GFX7-NEXT: s_cbranch_execz .LBB31_2 6455; GFX7-NEXT: .LBB31_6: ; %atomicrmw.private 6456; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6457; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, 
v4, vcc 6458; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 6459; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6460; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 6461; GFX7-NEXT: s_waitcnt vmcnt(0) 6462; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6463; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6464; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 6465; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6466; GFX7-NEXT: s_waitcnt vmcnt(0) 6467; GFX7-NEXT: s_setpc_b64 s[30:31] 6468 %gep = getelementptr double, ptr %ptr, i64 255 6469 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6470 ret double %result 6471} 6472 6473define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 6474; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6475; GFX12: ; %bb.0: 6476; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6477; GFX12-NEXT: s_wait_expcnt 0x0 6478; GFX12-NEXT: s_wait_samplecnt 0x0 6479; GFX12-NEXT: s_wait_bvhcnt 0x0 6480; GFX12-NEXT: s_wait_kmcnt 0x0 6481; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 6482; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 6483; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 6484; GFX12-NEXT: s_mov_b32 s0, exec_lo 6485; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 6486; GFX12-NEXT: s_wait_alu 0xfffe 6487; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6488; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 6489; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 6490; GFX12-NEXT: s_cbranch_execnz .LBB32_3 6491; GFX12-NEXT: ; %bb.1: ; %Flow3 6492; GFX12-NEXT: s_wait_alu 0xfffe 6493; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 6494; GFX12-NEXT: s_cbranch_execnz .LBB32_6 6495; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi 6496; GFX12-NEXT: s_wait_alu 0xfffe 6497; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6498; GFX12-NEXT: s_wait_alu 0xfffe 6499; GFX12-NEXT: s_setpc_b64 s[30:31] 6500; GFX12-NEXT: 
.LBB32_3: ; %atomicrmw.global 6501; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] 6502; GFX12-NEXT: s_mov_b32 s1, 0 6503; GFX12-NEXT: .LBB32_4: ; %atomicrmw.start 6504; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6505; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6506; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 6507; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6508; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[2:3] 6509; GFX12-NEXT: s_wait_storecnt 0x0 6510; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6511; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6512; GFX12-NEXT: global_inv scope:SCOPE_DEV 6513; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 6514; GFX12-NEXT: s_wait_alu 0xfffe 6515; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 6516; GFX12-NEXT: s_wait_alu 0xfffe 6517; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6518; GFX12-NEXT: s_cbranch_execnz .LBB32_4 6519; GFX12-NEXT: ; %bb.5: ; %Flow 6520; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 6521; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 6522; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 6523; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 6524; GFX12-NEXT: s_cbranch_execz .LBB32_2 6525; GFX12-NEXT: .LBB32_6: ; %atomicrmw.private 6526; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 6527; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 6528; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 6529; GFX12-NEXT: s_wait_loadcnt 0x0 6530; GFX12-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] 6531; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off 6532; GFX12-NEXT: s_wait_alu 0xfffe 6533; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6534; GFX12-NEXT: s_wait_alu 0xfffe 6535; GFX12-NEXT: s_setpc_b64 s[30:31] 6536; 6537; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6538; GFX940: ; %bb.0: 6539; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6540; GFX940-NEXT: s_movk_i32 s0, 0xf800 6541; GFX940-NEXT: s_mov_b32 s1, -1 6542; GFX940-NEXT: v_lshl_add_u64 
v[4:5], v[0:1], 0, s[0:1] 6543; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 6544; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 6545; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 6546; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 6547; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6548; GFX940-NEXT: s_cbranch_execnz .LBB32_3 6549; GFX940-NEXT: ; %bb.1: ; %Flow 6550; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 6551; GFX940-NEXT: s_cbranch_execnz .LBB32_4 6552; GFX940-NEXT: .LBB32_2: ; %atomicrmw.phi 6553; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6554; GFX940-NEXT: s_setpc_b64 s[30:31] 6555; GFX940-NEXT: .LBB32_3: ; %atomicrmw.global 6556; GFX940-NEXT: buffer_wbl2 sc1 6557; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 6558; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6559; GFX940-NEXT: buffer_inv sc1 6560; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 6561; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 6562; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 6563; GFX940-NEXT: s_cbranch_execz .LBB32_2 6564; GFX940-NEXT: .LBB32_4: ; %atomicrmw.private 6565; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6566; GFX940-NEXT: s_nop 1 6567; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6568; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 6569; GFX940-NEXT: s_waitcnt vmcnt(0) 6570; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6571; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 6572; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6573; GFX940-NEXT: s_waitcnt vmcnt(0) 6574; GFX940-NEXT: s_setpc_b64 s[30:31] 6575; 6576; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6577; GFX11: ; %bb.0: 6578; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6579; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 6580; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 6581; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 6582; GFX11-NEXT: s_mov_b32 s0, exec_lo 6583; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 6584; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) 6585; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 6586; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 6587; GFX11-NEXT: s_cbranch_execnz .LBB32_3 6588; GFX11-NEXT: ; %bb.1: ; %Flow3 6589; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 6590; GFX11-NEXT: s_cbranch_execnz .LBB32_6 6591; GFX11-NEXT: .LBB32_2: ; %atomicrmw.phi 6592; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6593; GFX11-NEXT: s_setpc_b64 s[30:31] 6594; GFX11-NEXT: .LBB32_3: ; %atomicrmw.global 6595; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] 6596; GFX11-NEXT: s_mov_b32 s1, 0 6597; GFX11-NEXT: .LBB32_4: ; %atomicrmw.start 6598; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6599; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6600; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 6601; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6602; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6603; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6604; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc 6605; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6606; GFX11-NEXT: buffer_gl1_inv 6607; GFX11-NEXT: buffer_gl0_inv 6608; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 6609; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 6610; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6611; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6612; GFX11-NEXT: s_cbranch_execnz .LBB32_4 6613; GFX11-NEXT: ; %bb.5: ; %Flow 6614; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 6615; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 6616; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 6617; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 6618; GFX11-NEXT: s_cbranch_execz .LBB32_2 6619; GFX11-NEXT: .LBB32_6: ; %atomicrmw.private 6620; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 6621; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 6622; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 6623; GFX11-NEXT: s_waitcnt vmcnt(0) 6624; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6625; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off 6626; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 6627; GFX11-NEXT: s_setpc_b64 s[30:31] 6628; 6629; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6630; GFX10: ; %bb.0: 6631; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6632; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 6633; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 6634; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 6635; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 6636; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 6637; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 6638; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 6639; GFX10-NEXT: s_cbranch_execnz .LBB32_3 6640; GFX10-NEXT: ; %bb.1: ; %Flow3 6641; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 6642; GFX10-NEXT: s_cbranch_execnz .LBB32_6 6643; GFX10-NEXT: .LBB32_2: ; %atomicrmw.phi 6644; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6645; GFX10-NEXT: s_setpc_b64 s[30:31] 6646; GFX10-NEXT: .LBB32_3: ; %atomicrmw.global 6647; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 6648; GFX10-NEXT: s_mov_b32 s5, 0 6649; GFX10-NEXT: .LBB32_4: ; %atomicrmw.start 6650; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6651; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6652; GFX10-NEXT: v_mov_b32_e32 v9, v1 6653; GFX10-NEXT: v_mov_b32_e32 v8, v0 6654; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6655; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6656; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6657; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6658; GFX10-NEXT: buffer_gl1_inv 6659; GFX10-NEXT: buffer_gl0_inv 6660; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 6661; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 6662; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 6663; GFX10-NEXT: s_cbranch_execnz .LBB32_4 6664; GFX10-NEXT: ; %bb.5: ; %Flow 6665; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 6666; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 6667; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 6668; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 6669; GFX10-NEXT: s_cbranch_execz .LBB32_2 
6670; GFX10-NEXT: .LBB32_6: ; %atomicrmw.private 6671; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 6672; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 6673; GFX10-NEXT: s_clause 0x1 6674; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6675; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 6676; GFX10-NEXT: s_waitcnt vmcnt(0) 6677; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6678; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6679; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 6680; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6681; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6682; GFX10-NEXT: s_setpc_b64 s[30:31] 6683; 6684; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6685; GFX90A: ; %bb.0: 6686; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6687; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 6688; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 6689; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 6690; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 6691; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 6692; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 6693; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6694; GFX90A-NEXT: s_cbranch_execnz .LBB32_3 6695; GFX90A-NEXT: ; %bb.1: ; %Flow 6696; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6697; GFX90A-NEXT: s_cbranch_execnz .LBB32_4 6698; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.phi 6699; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6700; GFX90A-NEXT: s_setpc_b64 s[30:31] 6701; GFX90A-NEXT: .LBB32_3: ; %atomicrmw.global 6702; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc 6703; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6704; GFX90A-NEXT: buffer_wbinvl1 6705; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 6706; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 6707; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6708; GFX90A-NEXT: s_cbranch_execz .LBB32_2 6709; GFX90A-NEXT: .LBB32_4: ; %atomicrmw.private 6710; GFX90A-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6711; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6712; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6713; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 6714; GFX90A-NEXT: s_waitcnt vmcnt(0) 6715; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6716; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6717; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 6718; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6719; GFX90A-NEXT: s_waitcnt vmcnt(0) 6720; GFX90A-NEXT: s_setpc_b64 s[30:31] 6721; 6722; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6723; GFX908: ; %bb.0: 6724; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6725; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 6726; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 6727; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 6728; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 6729; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 6730; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 6731; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6732; GFX908-NEXT: s_cbranch_execnz .LBB32_3 6733; GFX908-NEXT: ; %bb.1: ; %Flow3 6734; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6735; GFX908-NEXT: s_cbranch_execnz .LBB32_6 6736; GFX908-NEXT: .LBB32_2: ; %atomicrmw.phi 6737; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6738; GFX908-NEXT: s_setpc_b64 s[30:31] 6739; GFX908-NEXT: .LBB32_3: ; %atomicrmw.global 6740; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 6741; GFX908-NEXT: s_mov_b64 s[6:7], 0 6742; GFX908-NEXT: .LBB32_4: ; %atomicrmw.start 6743; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6744; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6745; GFX908-NEXT: v_mov_b32_e32 v9, v1 6746; GFX908-NEXT: v_mov_b32_e32 v8, v0 6747; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6748; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6749; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6750; GFX908-NEXT: 
buffer_wbinvl1 6751; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6752; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6753; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 6754; GFX908-NEXT: s_cbranch_execnz .LBB32_4 6755; GFX908-NEXT: ; %bb.5: ; %Flow 6756; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 6757; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 6758; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 6759; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6760; GFX908-NEXT: s_cbranch_execz .LBB32_2 6761; GFX908-NEXT: .LBB32_6: ; %atomicrmw.private 6762; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6763; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6764; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6765; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 6766; GFX908-NEXT: s_waitcnt vmcnt(0) 6767; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6768; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6769; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 6770; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6771; GFX908-NEXT: s_waitcnt vmcnt(0) 6772; GFX908-NEXT: s_setpc_b64 s[30:31] 6773; 6774; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6775; GFX8: ; %bb.0: 6776; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6777; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 6778; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 6779; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 6780; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 6781; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6782; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 6783; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6784; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 6785; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6786; GFX8-NEXT: s_cbranch_execnz .LBB32_3 6787; GFX8-NEXT: ; %bb.1: ; %Flow3 6788; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6789; GFX8-NEXT: s_cbranch_execnz .LBB32_6 6790; GFX8-NEXT: .LBB32_2: ; %atomicrmw.phi 6791; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6792; 
GFX8-NEXT: s_setpc_b64 s[30:31] 6793; GFX8-NEXT: .LBB32_3: ; %atomicrmw.global 6794; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 6795; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 6796; GFX8-NEXT: flat_load_dword v1, v[0:1] 6797; GFX8-NEXT: flat_load_dword v0, v[4:5] 6798; GFX8-NEXT: s_mov_b64 s[6:7], 0 6799; GFX8-NEXT: .LBB32_4: ; %atomicrmw.start 6800; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6801; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6802; GFX8-NEXT: v_mov_b32_e32 v9, v1 6803; GFX8-NEXT: v_mov_b32_e32 v8, v0 6804; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6805; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6806; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6807; GFX8-NEXT: buffer_wbinvl1 6808; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6809; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6810; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 6811; GFX8-NEXT: s_cbranch_execnz .LBB32_4 6812; GFX8-NEXT: ; %bb.5: ; %Flow 6813; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 6814; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 6815; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 6816; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6817; GFX8-NEXT: s_cbranch_execz .LBB32_2 6818; GFX8-NEXT: .LBB32_6: ; %atomicrmw.private 6819; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6820; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 6821; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 6822; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6823; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 6824; GFX8-NEXT: s_waitcnt vmcnt(0) 6825; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6826; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6827; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 6828; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6829; GFX8-NEXT: s_waitcnt vmcnt(0) 6830; GFX8-NEXT: s_setpc_b64 s[30:31] 6831; 6832; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 6833; GFX7: ; %bb.0: 6834; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6835; 
GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 6836; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 6837; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 6838; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 6839; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6840; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 6841; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 6842; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 6843; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6844; GFX7-NEXT: s_cbranch_execnz .LBB32_3 6845; GFX7-NEXT: ; %bb.1: ; %Flow3 6846; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6847; GFX7-NEXT: s_cbranch_execnz .LBB32_6 6848; GFX7-NEXT: .LBB32_2: ; %atomicrmw.phi 6849; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6850; GFX7-NEXT: s_setpc_b64 s[30:31] 6851; GFX7-NEXT: .LBB32_3: ; %atomicrmw.global 6852; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 6853; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 6854; GFX7-NEXT: flat_load_dword v1, v[0:1] 6855; GFX7-NEXT: flat_load_dword v0, v[4:5] 6856; GFX7-NEXT: s_mov_b64 s[6:7], 0 6857; GFX7-NEXT: .LBB32_4: ; %atomicrmw.start 6858; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6859; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6860; GFX7-NEXT: v_mov_b32_e32 v9, v1 6861; GFX7-NEXT: v_mov_b32_e32 v8, v0 6862; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] 6863; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 6864; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6865; GFX7-NEXT: buffer_wbinvl1 6866; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 6867; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6868; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 6869; GFX7-NEXT: s_cbranch_execnz .LBB32_4 6870; GFX7-NEXT: ; %bb.5: ; %Flow 6871; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 6872; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 6873; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 6874; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 6875; GFX7-NEXT: s_cbranch_execz .LBB32_2 6876; GFX7-NEXT: .LBB32_6: ; %atomicrmw.private 6877; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 6878; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, 
v4, vcc 6879; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 6880; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 6881; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 6882; GFX7-NEXT: s_waitcnt vmcnt(0) 6883; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 6884; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 6885; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 6886; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6887; GFX7-NEXT: s_waitcnt vmcnt(0) 6888; GFX7-NEXT: s_setpc_b64 s[30:31] 6889 %gep = getelementptr double, ptr %ptr, i64 -256 6890 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6891 ret double %result 6892} 6893 6894define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 6895; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 6896; GFX12: ; %bb.0: 6897; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6898; GFX12-NEXT: s_wait_expcnt 0x0 6899; GFX12-NEXT: s_wait_samplecnt 0x0 6900; GFX12-NEXT: s_wait_bvhcnt 0x0 6901; GFX12-NEXT: s_wait_kmcnt 0x0 6902; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 6903; GFX12-NEXT: s_mov_b32 s0, exec_lo 6904; GFX12-NEXT: s_wait_alu 0xfffe 6905; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 6906; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 6907; GFX12-NEXT: s_cbranch_execnz .LBB33_3 6908; GFX12-NEXT: ; %bb.1: ; %Flow3 6909; GFX12-NEXT: s_wait_alu 0xfffe 6910; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 6911; GFX12-NEXT: s_cbranch_execnz .LBB33_6 6912; GFX12-NEXT: .LBB33_2: ; %atomicrmw.phi 6913; GFX12-NEXT: s_wait_alu 0xfffe 6914; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6915; GFX12-NEXT: s_wait_alu 0xfffe 6916; GFX12-NEXT: s_setpc_b64 s[30:31] 6917; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global 6918; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] 6919; GFX12-NEXT: s_mov_b32 s1, 0 6920; GFX12-NEXT: .LBB33_4: ; %atomicrmw.start 6921; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6922; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 6923; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] 6924; GFX12-NEXT: s_wait_storecnt 0x0 6925; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6926; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6927; GFX12-NEXT: global_inv scope:SCOPE_DEV 6928; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 6929; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 6930; GFX12-NEXT: s_wait_alu 0xfffe 6931; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 6932; GFX12-NEXT: s_wait_alu 0xfffe 6933; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6934; GFX12-NEXT: s_cbranch_execnz .LBB33_4 6935; GFX12-NEXT: ; %bb.5: ; %Flow 6936; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 6937; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 6938; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 6939; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 6940; GFX12-NEXT: s_cbranch_execz .LBB33_2 6941; GFX12-NEXT: .LBB33_6: ; %atomicrmw.private 6942; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 6943; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 6944; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 6945; GFX12-NEXT: s_wait_loadcnt 0x0 6946; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] 6947; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off 6948; GFX12-NEXT: s_wait_alu 0xfffe 6949; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6950; GFX12-NEXT: s_wait_alu 0xfffe 6951; GFX12-NEXT: s_setpc_b64 s[30:31] 6952; 6953; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 6954; GFX940: ; %bb.0: 6955; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6956; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 6957; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 6958; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 6959; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6960; GFX940-NEXT: s_cbranch_execnz .LBB33_3 6961; GFX940-NEXT: ; %bb.1: ; %Flow 6962; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 6963; GFX940-NEXT: s_cbranch_execnz .LBB33_4 6964; GFX940-NEXT: 
.LBB33_2: ; %atomicrmw.phi 6965; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6966; GFX940-NEXT: s_setpc_b64 s[30:31] 6967; GFX940-NEXT: .LBB33_3: ; %atomicrmw.global 6968; GFX940-NEXT: buffer_wbl2 sc1 6969; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] 6970; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6971; GFX940-NEXT: buffer_inv sc1 6972; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 6973; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 6974; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 6975; GFX940-NEXT: s_cbranch_execz .LBB33_2 6976; GFX940-NEXT: .LBB33_4: ; %atomicrmw.private 6977; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 6978; GFX940-NEXT: s_nop 1 6979; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 6980; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 6981; GFX940-NEXT: s_waitcnt vmcnt(0) 6982; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 6983; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 6984; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6985; GFX940-NEXT: s_waitcnt vmcnt(0) 6986; GFX940-NEXT: s_setpc_b64 s[30:31] 6987; 6988; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 6989; GFX11: ; %bb.0: 6990; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6991; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 6992; GFX11-NEXT: s_mov_b32 s0, exec_lo 6993; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 6994; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 6995; GFX11-NEXT: s_cbranch_execnz .LBB33_3 6996; GFX11-NEXT: ; %bb.1: ; %Flow3 6997; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 6998; GFX11-NEXT: s_cbranch_execnz .LBB33_6 6999; GFX11-NEXT: .LBB33_2: ; %atomicrmw.phi 7000; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7001; GFX11-NEXT: s_setpc_b64 s[30:31] 7002; GFX11-NEXT: .LBB33_3: ; %atomicrmw.global 7003; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] 7004; GFX11-NEXT: s_mov_b32 s1, 0 7005; GFX11-NEXT: .LBB33_4: ; %atomicrmw.start 7006; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7007; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7008; GFX11-NEXT: 
v_add_f64 v[4:5], v[6:7], v[2:3] 7009; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7010; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 7011; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7012; GFX11-NEXT: buffer_gl1_inv 7013; GFX11-NEXT: buffer_gl0_inv 7014; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7015; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 7016; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 7017; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7018; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7019; GFX11-NEXT: s_cbranch_execnz .LBB33_4 7020; GFX11-NEXT: ; %bb.5: ; %Flow 7021; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 7022; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 7023; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 7024; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 7025; GFX11-NEXT: s_cbranch_execz .LBB33_2 7026; GFX11-NEXT: .LBB33_6: ; %atomicrmw.private 7027; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7028; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7029; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 7030; GFX11-NEXT: s_waitcnt vmcnt(0) 7031; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7032; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off 7033; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7034; GFX11-NEXT: s_setpc_b64 s[30:31] 7035; 7036; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 7037; GFX10: ; %bb.0: 7038; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7039; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 7040; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 7041; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 7042; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 7043; GFX10-NEXT: s_cbranch_execnz .LBB33_3 7044; GFX10-NEXT: ; %bb.1: ; %Flow3 7045; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 7046; GFX10-NEXT: s_cbranch_execnz .LBB33_6 7047; GFX10-NEXT: .LBB33_2: ; %atomicrmw.phi 7048; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7049; GFX10-NEXT: s_setpc_b64 s[30:31] 7050; GFX10-NEXT: .LBB33_3: ; %atomicrmw.global 7051; 
GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7052; GFX10-NEXT: s_mov_b32 s5, 0 7053; GFX10-NEXT: .LBB33_4: ; %atomicrmw.start 7054; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7055; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7056; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7057; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7058; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7059; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7060; GFX10-NEXT: buffer_gl1_inv 7061; GFX10-NEXT: buffer_gl0_inv 7062; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7063; GFX10-NEXT: v_mov_b32_e32 v7, v5 7064; GFX10-NEXT: v_mov_b32_e32 v6, v4 7065; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 7066; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 7067; GFX10-NEXT: s_cbranch_execnz .LBB33_4 7068; GFX10-NEXT: ; %bb.5: ; %Flow 7069; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 7070; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 7071; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 7072; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 7073; GFX10-NEXT: s_cbranch_execz .LBB33_2 7074; GFX10-NEXT: .LBB33_6: ; %atomicrmw.private 7075; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7076; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7077; GFX10-NEXT: s_clause 0x1 7078; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7079; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7080; GFX10-NEXT: s_waitcnt vmcnt(0) 7081; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7082; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7083; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7084; GFX10-NEXT: s_waitcnt_depctr 0xffe3 7085; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7086; GFX10-NEXT: s_setpc_b64 s[30:31] 7087; 7088; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 7089; GFX90A: ; %bb.0: 7090; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7091; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 7092; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 7093; 
GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 7094; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7095; GFX90A-NEXT: s_cbranch_execnz .LBB33_3 7096; GFX90A-NEXT: ; %bb.1: ; %Flow 7097; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7098; GFX90A-NEXT: s_cbranch_execnz .LBB33_4 7099; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.phi 7100; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7101; GFX90A-NEXT: s_setpc_b64 s[30:31] 7102; GFX90A-NEXT: .LBB33_3: ; %atomicrmw.global 7103; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] 7104; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7105; GFX90A-NEXT: buffer_wbinvl1 7106; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 7107; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 7108; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7109; GFX90A-NEXT: s_cbranch_execz .LBB33_2 7110; GFX90A-NEXT: .LBB33_4: ; %atomicrmw.private 7111; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7112; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7113; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7114; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7115; GFX90A-NEXT: s_waitcnt vmcnt(0) 7116; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7117; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7118; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7119; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7120; GFX90A-NEXT: s_waitcnt vmcnt(0) 7121; GFX90A-NEXT: s_setpc_b64 s[30:31] 7122; 7123; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 7124; GFX908: ; %bb.0: 7125; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7126; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 7127; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 7128; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 7129; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7130; GFX908-NEXT: s_cbranch_execnz .LBB33_3 7131; GFX908-NEXT: ; %bb.1: ; %Flow3 7132; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7133; GFX908-NEXT: s_cbranch_execnz .LBB33_6 7134; GFX908-NEXT: 
.LBB33_2: ; %atomicrmw.phi 7135; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7136; GFX908-NEXT: s_setpc_b64 s[30:31] 7137; GFX908-NEXT: .LBB33_3: ; %atomicrmw.global 7138; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7139; GFX908-NEXT: s_mov_b64 s[6:7], 0 7140; GFX908-NEXT: .LBB33_4: ; %atomicrmw.start 7141; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7142; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7143; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7144; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7145; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7146; GFX908-NEXT: buffer_wbinvl1 7147; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7148; GFX908-NEXT: v_mov_b32_e32 v7, v5 7149; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7150; GFX908-NEXT: v_mov_b32_e32 v6, v4 7151; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 7152; GFX908-NEXT: s_cbranch_execnz .LBB33_4 7153; GFX908-NEXT: ; %bb.5: ; %Flow 7154; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 7155; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 7156; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 7157; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7158; GFX908-NEXT: s_cbranch_execz .LBB33_2 7159; GFX908-NEXT: .LBB33_6: ; %atomicrmw.private 7160; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7161; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7162; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7163; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7164; GFX908-NEXT: s_waitcnt vmcnt(0) 7165; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7166; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7167; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7168; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7169; GFX908-NEXT: s_waitcnt vmcnt(0) 7170; GFX908-NEXT: s_setpc_b64 s[30:31] 7171; 7172; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 7173; GFX8: ; %bb.0: 7174; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7175; GFX8-NEXT: s_mov_b64 
s[4:5], 0xc0 7176; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 7177; GFX8-NEXT: s_waitcnt lgkmcnt(0) 7178; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 7179; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 7180; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7181; GFX8-NEXT: s_cbranch_execnz .LBB33_3 7182; GFX8-NEXT: ; %bb.1: ; %Flow3 7183; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7184; GFX8-NEXT: s_cbranch_execnz .LBB33_6 7185; GFX8-NEXT: .LBB33_2: ; %atomicrmw.phi 7186; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7187; GFX8-NEXT: s_setpc_b64 s[30:31] 7188; GFX8-NEXT: .LBB33_3: ; %atomicrmw.global 7189; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 7190; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 7191; GFX8-NEXT: flat_load_dword v7, v[4:5] 7192; GFX8-NEXT: flat_load_dword v6, v[0:1] 7193; GFX8-NEXT: s_mov_b64 s[6:7], 0 7194; GFX8-NEXT: .LBB33_4: ; %atomicrmw.start 7195; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7196; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7197; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7198; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7199; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7200; GFX8-NEXT: buffer_wbinvl1 7201; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7202; GFX8-NEXT: v_mov_b32_e32 v7, v5 7203; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7204; GFX8-NEXT: v_mov_b32_e32 v6, v4 7205; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 7206; GFX8-NEXT: s_cbranch_execnz .LBB33_4 7207; GFX8-NEXT: ; %bb.5: ; %Flow 7208; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 7209; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 7210; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 7211; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7212; GFX8-NEXT: s_cbranch_execz .LBB33_2 7213; GFX8-NEXT: .LBB33_6: ; %atomicrmw.private 7214; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7215; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7216; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 7217; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7218; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 
7219; GFX8-NEXT: s_waitcnt vmcnt(0) 7220; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7221; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7222; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 7223; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7224; GFX8-NEXT: s_waitcnt vmcnt(0) 7225; GFX8-NEXT: s_setpc_b64 s[30:31] 7226; 7227; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: 7228; GFX7: ; %bb.0: 7229; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7230; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 7231; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 7232; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7233; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 7234; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 7235; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7236; GFX7-NEXT: s_cbranch_execnz .LBB33_3 7237; GFX7-NEXT: ; %bb.1: ; %Flow3 7238; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7239; GFX7-NEXT: s_cbranch_execnz .LBB33_6 7240; GFX7-NEXT: .LBB33_2: ; %atomicrmw.phi 7241; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7242; GFX7-NEXT: s_setpc_b64 s[30:31] 7243; GFX7-NEXT: .LBB33_3: ; %atomicrmw.global 7244; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 7245; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 7246; GFX7-NEXT: flat_load_dword v7, v[4:5] 7247; GFX7-NEXT: flat_load_dword v6, v[0:1] 7248; GFX7-NEXT: s_mov_b64 s[6:7], 0 7249; GFX7-NEXT: .LBB33_4: ; %atomicrmw.start 7250; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7251; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7252; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7253; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7254; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7255; GFX7-NEXT: buffer_wbinvl1 7256; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7257; GFX7-NEXT: v_mov_b32_e32 v7, v5 7258; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7259; GFX7-NEXT: v_mov_b32_e32 v6, v4 7260; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 7261; GFX7-NEXT: s_cbranch_execnz .LBB33_4 7262; GFX7-NEXT: ; %bb.5: ; %Flow 7263; GFX7-NEXT: s_or_b64 
exec, exec, s[6:7] 7264; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 7265; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 7266; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7267; GFX7-NEXT: s_cbranch_execz .LBB33_2 7268; GFX7-NEXT: .LBB33_6: ; %atomicrmw.private 7269; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7270; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7271; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 7272; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7273; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 7274; GFX7-NEXT: s_waitcnt vmcnt(0) 7275; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7276; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7277; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 7278; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7279; GFX7-NEXT: s_waitcnt vmcnt(0) 7280; GFX7-NEXT: s_setpc_b64 s[30:31] 7281 %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7282 ret void 7283} 7284 7285define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 7286; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7287; GFX12: ; %bb.0: 7288; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7289; GFX12-NEXT: s_wait_expcnt 0x0 7290; GFX12-NEXT: s_wait_samplecnt 0x0 7291; GFX12-NEXT: s_wait_bvhcnt 0x0 7292; GFX12-NEXT: s_wait_kmcnt 0x0 7293; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 7294; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7295; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 7296; GFX12-NEXT: s_mov_b32 s0, exec_lo 7297; GFX12-NEXT: s_wait_alu 0xfffe 7298; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7299; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 7300; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 7301; GFX12-NEXT: s_cbranch_execnz .LBB34_3 7302; GFX12-NEXT: ; %bb.1: ; %Flow3 7303; GFX12-NEXT: s_wait_alu 0xfffe 7304; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 7305; GFX12-NEXT: s_cbranch_execnz 
.LBB34_6 7306; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi 7307; GFX12-NEXT: s_wait_alu 0xfffe 7308; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7309; GFX12-NEXT: s_wait_alu 0xfffe 7310; GFX12-NEXT: s_setpc_b64 s[30:31] 7311; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global 7312; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] 7313; GFX12-NEXT: s_mov_b32 s1, 0 7314; GFX12-NEXT: .LBB34_4: ; %atomicrmw.start 7315; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7316; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7317; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] 7318; GFX12-NEXT: s_wait_storecnt 0x0 7319; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7320; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7321; GFX12-NEXT: global_inv scope:SCOPE_DEV 7322; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7323; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 7324; GFX12-NEXT: s_wait_alu 0xfffe 7325; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 7326; GFX12-NEXT: s_wait_alu 0xfffe 7327; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7328; GFX12-NEXT: s_cbranch_execnz .LBB34_4 7329; GFX12-NEXT: ; %bb.5: ; %Flow 7330; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 7331; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 7332; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 7333; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 7334; GFX12-NEXT: s_cbranch_execz .LBB34_2 7335; GFX12-NEXT: .LBB34_6: ; %atomicrmw.private 7336; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7337; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7338; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 7339; GFX12-NEXT: s_wait_loadcnt 0x0 7340; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] 7341; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off 7342; GFX12-NEXT: s_wait_alu 0xfffe 7343; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7344; GFX12-NEXT: s_wait_alu 0xfffe 7345; GFX12-NEXT: s_setpc_b64 s[30:31] 7346; 7347; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7348; 
GFX940: ; %bb.0: 7349; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7350; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 7351; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 7352; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 7353; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 7354; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 7355; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7356; GFX940-NEXT: s_cbranch_execnz .LBB34_3 7357; GFX940-NEXT: ; %bb.1: ; %Flow 7358; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 7359; GFX940-NEXT: s_cbranch_execnz .LBB34_4 7360; GFX940-NEXT: .LBB34_2: ; %atomicrmw.phi 7361; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7362; GFX940-NEXT: s_setpc_b64 s[30:31] 7363; GFX940-NEXT: .LBB34_3: ; %atomicrmw.global 7364; GFX940-NEXT: buffer_wbl2 sc1 7365; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] 7366; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7367; GFX940-NEXT: buffer_inv sc1 7368; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 7369; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 7370; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 7371; GFX940-NEXT: s_cbranch_execz .LBB34_2 7372; GFX940-NEXT: .LBB34_4: ; %atomicrmw.private 7373; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7374; GFX940-NEXT: s_nop 1 7375; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7376; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 7377; GFX940-NEXT: s_waitcnt vmcnt(0) 7378; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7379; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 7380; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7381; GFX940-NEXT: s_waitcnt vmcnt(0) 7382; GFX940-NEXT: s_setpc_b64 s[30:31] 7383; 7384; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7385; GFX11: ; %bb.0: 7386; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7387; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 7388; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7389; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 7390; 
GFX11-NEXT: s_mov_b32 s0, exec_lo 7391; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7392; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 7393; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 7394; GFX11-NEXT: s_cbranch_execnz .LBB34_3 7395; GFX11-NEXT: ; %bb.1: ; %Flow3 7396; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 7397; GFX11-NEXT: s_cbranch_execnz .LBB34_6 7398; GFX11-NEXT: .LBB34_2: ; %atomicrmw.phi 7399; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7400; GFX11-NEXT: s_setpc_b64 s[30:31] 7401; GFX11-NEXT: .LBB34_3: ; %atomicrmw.global 7402; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] 7403; GFX11-NEXT: s_mov_b32 s1, 0 7404; GFX11-NEXT: .LBB34_4: ; %atomicrmw.start 7405; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7406; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7407; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7408; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7409; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 7410; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7411; GFX11-NEXT: buffer_gl1_inv 7412; GFX11-NEXT: buffer_gl0_inv 7413; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7414; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 7415; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 7416; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7417; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7418; GFX11-NEXT: s_cbranch_execnz .LBB34_4 7419; GFX11-NEXT: ; %bb.5: ; %Flow 7420; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 7421; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 7422; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 7423; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 7424; GFX11-NEXT: s_cbranch_execz .LBB34_2 7425; GFX11-NEXT: .LBB34_6: ; %atomicrmw.private 7426; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7427; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7428; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 7429; GFX11-NEXT: s_waitcnt vmcnt(0) 7430; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7431; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off 7432; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 7433; GFX11-NEXT: s_setpc_b64 s[30:31] 7434; 7435; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7436; GFX10: ; %bb.0: 7437; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7438; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 7439; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7440; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 7441; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 7442; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 7443; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 7444; GFX10-NEXT: s_cbranch_execnz .LBB34_3 7445; GFX10-NEXT: ; %bb.1: ; %Flow3 7446; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 7447; GFX10-NEXT: s_cbranch_execnz .LBB34_6 7448; GFX10-NEXT: .LBB34_2: ; %atomicrmw.phi 7449; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7450; GFX10-NEXT: s_setpc_b64 s[30:31] 7451; GFX10-NEXT: .LBB34_3: ; %atomicrmw.global 7452; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7453; GFX10-NEXT: s_mov_b32 s5, 0 7454; GFX10-NEXT: .LBB34_4: ; %atomicrmw.start 7455; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7456; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7457; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7458; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7459; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7460; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7461; GFX10-NEXT: buffer_gl1_inv 7462; GFX10-NEXT: buffer_gl0_inv 7463; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7464; GFX10-NEXT: v_mov_b32_e32 v7, v5 7465; GFX10-NEXT: v_mov_b32_e32 v6, v4 7466; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 7467; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 7468; GFX10-NEXT: s_cbranch_execnz .LBB34_4 7469; GFX10-NEXT: ; %bb.5: ; %Flow 7470; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 7471; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 7472; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 7473; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 7474; GFX10-NEXT: s_cbranch_execz .LBB34_2 7475; GFX10-NEXT: .LBB34_6: ; %atomicrmw.private 
7476; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7477; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7478; GFX10-NEXT: s_clause 0x1 7479; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7480; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7481; GFX10-NEXT: s_waitcnt vmcnt(0) 7482; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7483; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7484; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7485; GFX10-NEXT: s_waitcnt_depctr 0xffe3 7486; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7487; GFX10-NEXT: s_setpc_b64 s[30:31] 7488; 7489; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7490; GFX90A: ; %bb.0: 7491; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7492; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 7493; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7494; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 7495; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 7496; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 7497; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7498; GFX90A-NEXT: s_cbranch_execnz .LBB34_3 7499; GFX90A-NEXT: ; %bb.1: ; %Flow 7500; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7501; GFX90A-NEXT: s_cbranch_execnz .LBB34_4 7502; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.phi 7503; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7504; GFX90A-NEXT: s_setpc_b64 s[30:31] 7505; GFX90A-NEXT: .LBB34_3: ; %atomicrmw.global 7506; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] 7507; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7508; GFX90A-NEXT: buffer_wbinvl1 7509; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 7510; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 7511; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7512; GFX90A-NEXT: s_cbranch_execz .LBB34_2 7513; GFX90A-NEXT: .LBB34_4: ; %atomicrmw.private 7514; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7515; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7516; GFX90A-NEXT: 
buffer_load_dword v0, v4, s[0:3], 0 offen 7517; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7518; GFX90A-NEXT: s_waitcnt vmcnt(0) 7519; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7520; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7521; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7522; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7523; GFX90A-NEXT: s_waitcnt vmcnt(0) 7524; GFX90A-NEXT: s_setpc_b64 s[30:31] 7525; 7526; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7527; GFX908: ; %bb.0: 7528; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7529; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 7530; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7531; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 7532; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 7533; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 7534; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7535; GFX908-NEXT: s_cbranch_execnz .LBB34_3 7536; GFX908-NEXT: ; %bb.1: ; %Flow3 7537; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7538; GFX908-NEXT: s_cbranch_execnz .LBB34_6 7539; GFX908-NEXT: .LBB34_2: ; %atomicrmw.phi 7540; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7541; GFX908-NEXT: s_setpc_b64 s[30:31] 7542; GFX908-NEXT: .LBB34_3: ; %atomicrmw.global 7543; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7544; GFX908-NEXT: s_mov_b64 s[6:7], 0 7545; GFX908-NEXT: .LBB34_4: ; %atomicrmw.start 7546; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7547; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7548; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7549; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7550; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7551; GFX908-NEXT: buffer_wbinvl1 7552; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7553; GFX908-NEXT: v_mov_b32_e32 v7, v5 7554; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7555; GFX908-NEXT: v_mov_b32_e32 v6, v4 7556; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[6:7] 7557; GFX908-NEXT: s_cbranch_execnz .LBB34_4 7558; GFX908-NEXT: ; %bb.5: ; %Flow 7559; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 7560; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 7561; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 7562; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7563; GFX908-NEXT: s_cbranch_execz .LBB34_2 7564; GFX908-NEXT: .LBB34_6: ; %atomicrmw.private 7565; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7566; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7567; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7568; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7569; GFX908-NEXT: s_waitcnt vmcnt(0) 7570; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7571; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7572; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7573; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7574; GFX908-NEXT: s_waitcnt vmcnt(0) 7575; GFX908-NEXT: s_setpc_b64 s[30:31] 7576; 7577; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7578; GFX8: ; %bb.0: 7579; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7580; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 7581; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 7582; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 7583; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7584; GFX8-NEXT: s_waitcnt lgkmcnt(0) 7585; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 7586; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 7587; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7588; GFX8-NEXT: s_cbranch_execnz .LBB34_3 7589; GFX8-NEXT: ; %bb.1: ; %Flow3 7590; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7591; GFX8-NEXT: s_cbranch_execnz .LBB34_6 7592; GFX8-NEXT: .LBB34_2: ; %atomicrmw.phi 7593; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7594; GFX8-NEXT: s_setpc_b64 s[30:31] 7595; GFX8-NEXT: .LBB34_3: ; %atomicrmw.global 7596; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 7597; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 7598; GFX8-NEXT: flat_load_dword v7, 
v[4:5] 7599; GFX8-NEXT: flat_load_dword v6, v[0:1] 7600; GFX8-NEXT: s_mov_b64 s[6:7], 0 7601; GFX8-NEXT: .LBB34_4: ; %atomicrmw.start 7602; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7603; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7604; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7605; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7606; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7607; GFX8-NEXT: buffer_wbinvl1 7608; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7609; GFX8-NEXT: v_mov_b32_e32 v7, v5 7610; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7611; GFX8-NEXT: v_mov_b32_e32 v6, v4 7612; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 7613; GFX8-NEXT: s_cbranch_execnz .LBB34_4 7614; GFX8-NEXT: ; %bb.5: ; %Flow 7615; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 7616; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 7617; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 7618; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7619; GFX8-NEXT: s_cbranch_execz .LBB34_2 7620; GFX8-NEXT: .LBB34_6: ; %atomicrmw.private 7621; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7622; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7623; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 7624; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7625; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 7626; GFX8-NEXT: s_waitcnt vmcnt(0) 7627; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7628; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7629; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 7630; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7631; GFX8-NEXT: s_waitcnt vmcnt(0) 7632; GFX8-NEXT: s_setpc_b64 s[30:31] 7633; 7634; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 7635; GFX7: ; %bb.0: 7636; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7637; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 7638; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 7639; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 7640; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7641; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) 7642; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 7643; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 7644; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7645; GFX7-NEXT: s_cbranch_execnz .LBB34_3 7646; GFX7-NEXT: ; %bb.1: ; %Flow3 7647; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7648; GFX7-NEXT: s_cbranch_execnz .LBB34_6 7649; GFX7-NEXT: .LBB34_2: ; %atomicrmw.phi 7650; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7651; GFX7-NEXT: s_setpc_b64 s[30:31] 7652; GFX7-NEXT: .LBB34_3: ; %atomicrmw.global 7653; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 7654; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 7655; GFX7-NEXT: flat_load_dword v7, v[4:5] 7656; GFX7-NEXT: flat_load_dword v6, v[0:1] 7657; GFX7-NEXT: s_mov_b64 s[6:7], 0 7658; GFX7-NEXT: .LBB34_4: ; %atomicrmw.start 7659; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7660; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7661; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7662; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7663; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7664; GFX7-NEXT: buffer_wbinvl1 7665; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7666; GFX7-NEXT: v_mov_b32_e32 v7, v5 7667; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7668; GFX7-NEXT: v_mov_b32_e32 v6, v4 7669; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 7670; GFX7-NEXT: s_cbranch_execnz .LBB34_4 7671; GFX7-NEXT: ; %bb.5: ; %Flow 7672; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 7673; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 7674; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 7675; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7676; GFX7-NEXT: s_cbranch_execz .LBB34_2 7677; GFX7-NEXT: .LBB34_6: ; %atomicrmw.private 7678; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7679; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7680; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 7681; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7682; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 7683; GFX7-NEXT: s_waitcnt vmcnt(0) 7684; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 
7685; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7686; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 7687; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7688; GFX7-NEXT: s_waitcnt vmcnt(0) 7689; GFX7-NEXT: s_setpc_b64 s[30:31] 7690 %gep = getelementptr double, ptr %ptr, i64 255 7691 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7692 ret void 7693} 7694 7695define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 7696; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7697; GFX12: ; %bb.0: 7698; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7699; GFX12-NEXT: s_wait_expcnt 0x0 7700; GFX12-NEXT: s_wait_samplecnt 0x0 7701; GFX12-NEXT: s_wait_bvhcnt 0x0 7702; GFX12-NEXT: s_wait_kmcnt 0x0 7703; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 7704; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7705; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 7706; GFX12-NEXT: s_mov_b32 s0, exec_lo 7707; GFX12-NEXT: s_wait_alu 0xfffe 7708; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7709; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 7710; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 7711; GFX12-NEXT: s_cbranch_execnz .LBB35_3 7712; GFX12-NEXT: ; %bb.1: ; %Flow3 7713; GFX12-NEXT: s_wait_alu 0xfffe 7714; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 7715; GFX12-NEXT: s_cbranch_execnz .LBB35_6 7716; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi 7717; GFX12-NEXT: s_wait_alu 0xfffe 7718; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7719; GFX12-NEXT: s_wait_alu 0xfffe 7720; GFX12-NEXT: s_setpc_b64 s[30:31] 7721; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global 7722; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] 7723; GFX12-NEXT: s_mov_b32 s1, 0 7724; GFX12-NEXT: .LBB35_4: ; %atomicrmw.start 7725; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7726; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7727; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] 7728; 
GFX12-NEXT: s_wait_storecnt 0x0 7729; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7730; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7731; GFX12-NEXT: global_inv scope:SCOPE_DEV 7732; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7733; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 7734; GFX12-NEXT: s_wait_alu 0xfffe 7735; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 7736; GFX12-NEXT: s_wait_alu 0xfffe 7737; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7738; GFX12-NEXT: s_cbranch_execnz .LBB35_4 7739; GFX12-NEXT: ; %bb.5: ; %Flow 7740; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 7741; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 7742; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 7743; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 7744; GFX12-NEXT: s_cbranch_execz .LBB35_2 7745; GFX12-NEXT: .LBB35_6: ; %atomicrmw.private 7746; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7747; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7748; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 7749; GFX12-NEXT: s_wait_loadcnt 0x0 7750; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] 7751; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off 7752; GFX12-NEXT: s_wait_alu 0xfffe 7753; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7754; GFX12-NEXT: s_wait_alu 0xfffe 7755; GFX12-NEXT: s_setpc_b64 s[30:31] 7756; 7757; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7758; GFX940: ; %bb.0: 7759; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7760; GFX940-NEXT: s_movk_i32 s0, 0xf800 7761; GFX940-NEXT: s_mov_b32 s1, -1 7762; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 7763; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 7764; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 7765; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 7766; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7767; GFX940-NEXT: s_cbranch_execnz .LBB35_3 7768; GFX940-NEXT: ; %bb.1: ; %Flow 7769; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 
7770; GFX940-NEXT: s_cbranch_execnz .LBB35_4 7771; GFX940-NEXT: .LBB35_2: ; %atomicrmw.phi 7772; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7773; GFX940-NEXT: s_setpc_b64 s[30:31] 7774; GFX940-NEXT: .LBB35_3: ; %atomicrmw.global 7775; GFX940-NEXT: buffer_wbl2 sc1 7776; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] 7777; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7778; GFX940-NEXT: buffer_inv sc1 7779; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 7780; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 7781; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 7782; GFX940-NEXT: s_cbranch_execz .LBB35_2 7783; GFX940-NEXT: .LBB35_4: ; %atomicrmw.private 7784; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7785; GFX940-NEXT: s_nop 1 7786; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7787; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 7788; GFX940-NEXT: s_waitcnt vmcnt(0) 7789; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7790; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 7791; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7792; GFX940-NEXT: s_waitcnt vmcnt(0) 7793; GFX940-NEXT: s_setpc_b64 s[30:31] 7794; 7795; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7796; GFX11: ; %bb.0: 7797; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7798; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 7799; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7800; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 7801; GFX11-NEXT: s_mov_b32 s0, exec_lo 7802; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7803; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 7804; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 7805; GFX11-NEXT: s_cbranch_execnz .LBB35_3 7806; GFX11-NEXT: ; %bb.1: ; %Flow3 7807; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 7808; GFX11-NEXT: s_cbranch_execnz .LBB35_6 7809; GFX11-NEXT: .LBB35_2: ; %atomicrmw.phi 7810; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7811; GFX11-NEXT: s_setpc_b64 s[30:31] 7812; GFX11-NEXT: .LBB35_3: ; 
%atomicrmw.global 7813; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] 7814; GFX11-NEXT: s_mov_b32 s1, 0 7815; GFX11-NEXT: .LBB35_4: ; %atomicrmw.start 7816; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7817; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7818; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7819; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7820; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 7821; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7822; GFX11-NEXT: buffer_gl1_inv 7823; GFX11-NEXT: buffer_gl0_inv 7824; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7825; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 7826; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 7827; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7828; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7829; GFX11-NEXT: s_cbranch_execnz .LBB35_4 7830; GFX11-NEXT: ; %bb.5: ; %Flow 7831; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 7832; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 7833; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 7834; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 7835; GFX11-NEXT: s_cbranch_execz .LBB35_2 7836; GFX11-NEXT: .LBB35_6: ; %atomicrmw.private 7837; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7838; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7839; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 7840; GFX11-NEXT: s_waitcnt vmcnt(0) 7841; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7842; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off 7843; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7844; GFX11-NEXT: s_setpc_b64 s[30:31] 7845; 7846; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7847; GFX10: ; %bb.0: 7848; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7849; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 7850; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7851; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 7852; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 7853; GFX10-NEXT: s_and_saveexec_b32 s4, 
vcc_lo 7854; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 7855; GFX10-NEXT: s_cbranch_execnz .LBB35_3 7856; GFX10-NEXT: ; %bb.1: ; %Flow3 7857; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 7858; GFX10-NEXT: s_cbranch_execnz .LBB35_6 7859; GFX10-NEXT: .LBB35_2: ; %atomicrmw.phi 7860; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7861; GFX10-NEXT: s_setpc_b64 s[30:31] 7862; GFX10-NEXT: .LBB35_3: ; %atomicrmw.global 7863; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7864; GFX10-NEXT: s_mov_b32 s5, 0 7865; GFX10-NEXT: .LBB35_4: ; %atomicrmw.start 7866; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7867; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7868; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7869; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7870; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7871; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7872; GFX10-NEXT: buffer_gl1_inv 7873; GFX10-NEXT: buffer_gl0_inv 7874; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 7875; GFX10-NEXT: v_mov_b32_e32 v7, v5 7876; GFX10-NEXT: v_mov_b32_e32 v6, v4 7877; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 7878; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 7879; GFX10-NEXT: s_cbranch_execnz .LBB35_4 7880; GFX10-NEXT: ; %bb.5: ; %Flow 7881; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 7882; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 7883; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 7884; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 7885; GFX10-NEXT: s_cbranch_execz .LBB35_2 7886; GFX10-NEXT: .LBB35_6: ; %atomicrmw.private 7887; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 7888; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 7889; GFX10-NEXT: s_clause 0x1 7890; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7891; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7892; GFX10-NEXT: s_waitcnt vmcnt(0) 7893; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7894; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7895; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7896; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 7897; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7898; GFX10-NEXT: s_setpc_b64 s[30:31] 7899; 7900; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7901; GFX90A: ; %bb.0: 7902; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7903; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 7904; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 7905; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 7906; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 7907; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 7908; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7909; GFX90A-NEXT: s_cbranch_execnz .LBB35_3 7910; GFX90A-NEXT: ; %bb.1: ; %Flow 7911; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7912; GFX90A-NEXT: s_cbranch_execnz .LBB35_4 7913; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.phi 7914; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7915; GFX90A-NEXT: s_setpc_b64 s[30:31] 7916; GFX90A-NEXT: .LBB35_3: ; %atomicrmw.global 7917; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] 7918; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7919; GFX90A-NEXT: buffer_wbinvl1 7920; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 7921; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 7922; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7923; GFX90A-NEXT: s_cbranch_execz .LBB35_2 7924; GFX90A-NEXT: .LBB35_4: ; %atomicrmw.private 7925; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7926; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 7927; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7928; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7929; GFX90A-NEXT: s_waitcnt vmcnt(0) 7930; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7931; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7932; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7933; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7934; GFX90A-NEXT: s_waitcnt vmcnt(0) 7935; GFX90A-NEXT: s_setpc_b64 s[30:31] 7936; 7937; GFX908-LABEL: 
flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7938; GFX908: ; %bb.0: 7939; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7940; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 7941; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 7942; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 7943; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 7944; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 7945; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7946; GFX908-NEXT: s_cbranch_execnz .LBB35_3 7947; GFX908-NEXT: ; %bb.1: ; %Flow3 7948; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7949; GFX908-NEXT: s_cbranch_execnz .LBB35_6 7950; GFX908-NEXT: .LBB35_2: ; %atomicrmw.phi 7951; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7952; GFX908-NEXT: s_setpc_b64 s[30:31] 7953; GFX908-NEXT: .LBB35_3: ; %atomicrmw.global 7954; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 7955; GFX908-NEXT: s_mov_b64 s[6:7], 0 7956; GFX908-NEXT: .LBB35_4: ; %atomicrmw.start 7957; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7958; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7959; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 7960; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 7961; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7962; GFX908-NEXT: buffer_wbinvl1 7963; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 7964; GFX908-NEXT: v_mov_b32_e32 v7, v5 7965; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7966; GFX908-NEXT: v_mov_b32_e32 v6, v4 7967; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 7968; GFX908-NEXT: s_cbranch_execnz .LBB35_4 7969; GFX908-NEXT: ; %bb.5: ; %Flow 7970; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 7971; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 7972; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 7973; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 7974; GFX908-NEXT: s_cbranch_execz .LBB35_2 7975; GFX908-NEXT: .LBB35_6: ; %atomicrmw.private 7976; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 7977; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, 
vcc 7978; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 7979; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 7980; GFX908-NEXT: s_waitcnt vmcnt(0) 7981; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 7982; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 7983; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 7984; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7985; GFX908-NEXT: s_waitcnt vmcnt(0) 7986; GFX908-NEXT: s_setpc_b64 s[30:31] 7987; 7988; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 7989; GFX8: ; %bb.0: 7990; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7991; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 7992; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 7993; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 7994; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 7995; GFX8-NEXT: s_waitcnt lgkmcnt(0) 7996; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 7997; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 7998; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 7999; GFX8-NEXT: s_cbranch_execnz .LBB35_3 8000; GFX8-NEXT: ; %bb.1: ; %Flow3 8001; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 8002; GFX8-NEXT: s_cbranch_execnz .LBB35_6 8003; GFX8-NEXT: .LBB35_2: ; %atomicrmw.phi 8004; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8005; GFX8-NEXT: s_setpc_b64 s[30:31] 8006; GFX8-NEXT: .LBB35_3: ; %atomicrmw.global 8007; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 8008; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 8009; GFX8-NEXT: flat_load_dword v7, v[4:5] 8010; GFX8-NEXT: flat_load_dword v6, v[0:1] 8011; GFX8-NEXT: s_mov_b64 s[6:7], 0 8012; GFX8-NEXT: .LBB35_4: ; %atomicrmw.start 8013; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8014; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8015; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 8016; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 8017; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8018; GFX8-NEXT: buffer_wbinvl1 8019; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], 
v[6:7] 8020; GFX8-NEXT: v_mov_b32_e32 v7, v5 8021; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8022; GFX8-NEXT: v_mov_b32_e32 v6, v4 8023; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 8024; GFX8-NEXT: s_cbranch_execnz .LBB35_4 8025; GFX8-NEXT: ; %bb.5: ; %Flow 8026; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 8027; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 8028; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 8029; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 8030; GFX8-NEXT: s_cbranch_execz .LBB35_2 8031; GFX8-NEXT: .LBB35_6: ; %atomicrmw.private 8032; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 8033; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 8034; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 8035; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 8036; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 8037; GFX8-NEXT: s_waitcnt vmcnt(0) 8038; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 8039; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 8040; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 8041; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8042; GFX8-NEXT: s_waitcnt vmcnt(0) 8043; GFX8-NEXT: s_setpc_b64 s[30:31] 8044; 8045; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 8046; GFX7: ; %bb.0: 8047; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8048; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 8049; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 8050; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 8051; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 8052; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8053; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 8054; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 8055; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 8056; GFX7-NEXT: s_cbranch_execnz .LBB35_3 8057; GFX7-NEXT: ; %bb.1: ; %Flow3 8058; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 8059; GFX7-NEXT: s_cbranch_execnz .LBB35_6 8060; GFX7-NEXT: .LBB35_2: ; %atomicrmw.phi 8061; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8062; GFX7-NEXT: s_setpc_b64 s[30:31] 8063; GFX7-NEXT: 
.LBB35_3: ; %atomicrmw.global 8064; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 8065; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 8066; GFX7-NEXT: flat_load_dword v7, v[4:5] 8067; GFX7-NEXT: flat_load_dword v6, v[0:1] 8068; GFX7-NEXT: s_mov_b64 s[6:7], 0 8069; GFX7-NEXT: .LBB35_4: ; %atomicrmw.start 8070; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8071; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8072; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] 8073; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 8074; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8075; GFX7-NEXT: buffer_wbinvl1 8076; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 8077; GFX7-NEXT: v_mov_b32_e32 v7, v5 8078; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8079; GFX7-NEXT: v_mov_b32_e32 v6, v4 8080; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 8081; GFX7-NEXT: s_cbranch_execnz .LBB35_4 8082; GFX7-NEXT: ; %bb.5: ; %Flow 8083; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 8084; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 8085; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 8086; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 8087; GFX7-NEXT: s_cbranch_execz .LBB35_2 8088; GFX7-NEXT: .LBB35_6: ; %atomicrmw.private 8089; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 8090; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 8091; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 8092; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 8093; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 8094; GFX7-NEXT: s_waitcnt vmcnt(0) 8095; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 8096; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 8097; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 8098; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8099; GFX7-NEXT: s_waitcnt vmcnt(0) 8100; GFX7-NEXT: s_setpc_b64 s[30:31] 8101 %gep = getelementptr double, ptr %ptr, i64 -256 8102 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 8103 ret void 8104} 8105 8106; 
; --------------------------------------------------------------------
; half
; --------------------------------------------------------------------

; Returning f16 fadd through a flat pointer: syncscope("agent"), seq_cst, with
; !amdgpu.no.fine.grained.memory attached. Every prefix checked below expects
; the same shape: align the address down to a dword (and with -4), turn the low
; two address bits into a bit shift, then retry a 32-bit flat_atomic_cmpswap
; until the dword read back equals the one the new value was built from.
; GFX7 performs the add in f32 (v_cvt_f32_f16 / v_add_f32 / v_cvt_f16_f32) and
; converts the extracted result to f32 before returning.
; The GFXn lines are generated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
; GFX940-NEXT: flat_load_dword v4, v[0:1]
; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB36_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB36_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB36_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v4, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v3, v0
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v4
; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB36_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v0
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB36_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret half %result
}

; Returning f16 fadd at %ptr + 1023 halfs (byte offset 0x7fe), otherwise the
; same atomicrmw as the no-offset case: syncscope("agent"), seq_cst,
; !amdgpu.no.fine.grained.memory. For every prefix the checks show 0x7fe being
; added to the 64-bit address with explicit ALU instructions first, followed by
; the dword-align / shift / 32-bit flat_atomic_cmpswap retry loop; GFX7 again
; does the add in f32.
define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
; GFX940-NEXT: v_mov_b32_e32 v1, v7
; GFX940-NEXT: flat_load_dword v4, v[0:1]
; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB37_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB37_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB37_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v4, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v4
; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB37_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB37_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
  ; 1023 half elements * 2 bytes = 0x7fe, the constant added in the checks above.
  %gep = getelementptr half, ptr %ptr, i64 1023
  %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret half %result
}

define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo,
0xfffff800, v0 8707; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 8708; GFX12-NEXT: s_mov_b32 s0, 0 8709; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 8710; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 8711; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 8712; GFX12-NEXT: flat_load_b32 v5, v[0:1] 8713; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8714; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8715; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8716; GFX12-NEXT: v_not_b32_e32 v4, v4 8717; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start 8718; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 8719; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8720; GFX12-NEXT: v_mov_b32_e32 v6, v5 8721; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8722; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8723; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 8724; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8725; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 8726; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8727; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8728; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 8729; GFX12-NEXT: s_wait_storecnt 0x0 8730; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 8731; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8732; GFX12-NEXT: global_inv scope:SCOPE_DEV 8733; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8734; GFX12-NEXT: s_wait_alu 0xfffe 8735; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8736; GFX12-NEXT: s_wait_alu 0xfffe 8737; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8738; GFX12-NEXT: s_cbranch_execnz .LBB38_1 8739; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8740; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8741; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8742; GFX12-NEXT: s_wait_alu 0xfffe 8743; GFX12-NEXT: s_setpc_b64 s[30:31] 8744; 8745; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8746; GFX940: ; %bb.0: 8747; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8748; GFX940-NEXT: s_movk_i32 s0, 0xf800 8749; GFX940-NEXT: s_mov_b32 s1, -1 8750; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] 8751; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 8752; GFX940-NEXT: v_mov_b32_e32 v1, v7 8753; GFX940-NEXT: flat_load_dword v4, v[0:1] 8754; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 8755; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8756; GFX940-NEXT: s_mov_b32 s0, 0xffff 8757; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 8758; GFX940-NEXT: v_not_b32_e32 v5, v5 8759; GFX940-NEXT: s_mov_b64 s[0:1], 0 8760; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start 8761; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8762; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8763; GFX940-NEXT: v_mov_b32_e32 v7, v4 8764; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 8765; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 8766; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 8767; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 8768; GFX940-NEXT: buffer_wbl2 sc1 8769; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 8770; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8771; GFX940-NEXT: buffer_inv sc1 8772; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 8773; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8774; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 8775; GFX940-NEXT: s_cbranch_execnz .LBB38_1 8776; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8777; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 8778; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 8779; GFX940-NEXT: s_setpc_b64 s[30:31] 8780; 8781; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8782; GFX11: ; %bb.0: 8783; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8784; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 8785; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 8786; GFX11-NEXT: s_mov_b32 s0, 0 8787; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 8788; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 8789; GFX11-NEXT: 
v_and_b32_e32 v3, 3, v3 8790; GFX11-NEXT: flat_load_b32 v5, v[0:1] 8791; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8792; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8793; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8794; GFX11-NEXT: v_not_b32_e32 v4, v4 8795; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start 8796; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8797; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8798; GFX11-NEXT: v_mov_b32_e32 v6, v5 8799; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8800; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8801; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 8802; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8803; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 8804; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8805; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8806; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 8807; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8808; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 8809; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8810; GFX11-NEXT: buffer_gl1_inv 8811; GFX11-NEXT: buffer_gl0_inv 8812; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8813; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 8814; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8815; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8816; GFX11-NEXT: s_cbranch_execnz .LBB38_1 8817; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8818; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 8819; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8820; GFX11-NEXT: s_setpc_b64 s[30:31] 8821; 8822; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8823; GFX10: ; %bb.0: 8824; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8825; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 8826; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 8827; GFX10-NEXT: s_mov_b32 s4, 0 8828; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 8829; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 8830; GFX10-NEXT: 
flat_load_dword v5, v[0:1] 8831; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8832; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8833; GFX10-NEXT: v_not_b32_e32 v4, v4 8834; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start 8835; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8836; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8837; GFX10-NEXT: v_mov_b32_e32 v6, v5 8838; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8839; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 8840; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 8841; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 8842; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8843; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8844; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8845; GFX10-NEXT: buffer_gl1_inv 8846; GFX10-NEXT: buffer_gl0_inv 8847; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8848; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 8849; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 8850; GFX10-NEXT: s_cbranch_execnz .LBB38_1 8851; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8852; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 8853; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8854; GFX10-NEXT: s_setpc_b64 s[30:31] 8855; 8856; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8857; GFX90A: ; %bb.0: 8858; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8859; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 8860; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 8861; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 8862; GFX90A-NEXT: flat_load_dword v4, v[0:1] 8863; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 8864; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8865; GFX90A-NEXT: s_mov_b32 s4, 0xffff 8866; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 8867; GFX90A-NEXT: v_not_b32_e32 v5, v5 8868; GFX90A-NEXT: s_mov_b64 s[4:5], 0 8869; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start 8870; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8871; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8872; 
GFX90A-NEXT: v_mov_b32_e32 v7, v4 8873; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 8874; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 8875; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 8876; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 8877; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 8878; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8879; GFX90A-NEXT: buffer_wbinvl1 8880; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 8881; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8882; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 8883; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 8884; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8885; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 8886; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 8887; GFX90A-NEXT: s_setpc_b64 s[30:31] 8888; 8889; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8890; GFX908: ; %bb.0: 8891; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8892; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 8893; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 8894; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 8895; GFX908-NEXT: flat_load_dword v4, v[0:1] 8896; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 8897; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8898; GFX908-NEXT: s_mov_b32 s4, 0xffff 8899; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 8900; GFX908-NEXT: v_not_b32_e32 v5, v5 8901; GFX908-NEXT: s_mov_b64 s[4:5], 0 8902; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start 8903; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8904; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8905; GFX908-NEXT: v_mov_b32_e32 v7, v4 8906; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 8907; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 8908; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 8909; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 8910; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 8911; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8912; GFX908-NEXT: buffer_wbinvl1 8913; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 8914; GFX908-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] 8915; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 8916; GFX908-NEXT: s_cbranch_execnz .LBB38_1 8917; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8918; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 8919; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 8920; GFX908-NEXT: s_setpc_b64 s[30:31] 8921; 8922; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8923; GFX8: ; %bb.0: 8924; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8925; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 8926; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 8927; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 8928; GFX8-NEXT: flat_load_dword v5, v[0:1] 8929; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 8930; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8931; GFX8-NEXT: s_mov_b32 s4, 0xffff 8932; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 8933; GFX8-NEXT: v_not_b32_e32 v4, v4 8934; GFX8-NEXT: s_mov_b64 s[4:5], 0 8935; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start 8936; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8937; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8938; GFX8-NEXT: v_mov_b32_e32 v6, v5 8939; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8940; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 8941; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 8942; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8943; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 8944; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8945; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8946; GFX8-NEXT: buffer_wbinvl1 8947; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 8948; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8949; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8950; GFX8-NEXT: s_cbranch_execnz .LBB38_1 8951; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8952; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8953; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8954; GFX8-NEXT: s_setpc_b64 s[30:31] 8955; 8956; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 8957; GFX7: ; %bb.0: 8958; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8959; GFX7-NEXT: 
v_add_i32_e32 v3, vcc, 0xfffff800, v0 8960; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 8961; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 8962; GFX7-NEXT: flat_load_dword v5, v[0:1] 8963; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 8964; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 8965; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 8966; GFX7-NEXT: s_mov_b64 s[4:5], 0 8967; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 8968; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 8969; GFX7-NEXT: v_not_b32_e32 v4, v4 8970; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start 8971; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8972; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8973; GFX7-NEXT: v_mov_b32_e32 v6, v5 8974; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 8975; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 8976; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 8977; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 8978; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 8979; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 8980; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 8981; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8982; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8983; GFX7-NEXT: buffer_wbinvl1 8984; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 8985; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8986; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8987; GFX7-NEXT: s_cbranch_execnz .LBB38_1 8988; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8989; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8990; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 8991; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 8992; GFX7-NEXT: s_setpc_b64 s[30:31] 8993 %gep = getelementptr half, ptr %ptr, i64 -1024 8994 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 8995 ret half %result 8996 } 8997 8998define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 8999; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9000; GFX12: ; %bb.0: 9001; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9002; GFX12-NEXT: s_wait_expcnt 0x0 9003; GFX12-NEXT: 
s_wait_samplecnt 0x0 9004; GFX12-NEXT: s_wait_bvhcnt 0x0 9005; GFX12-NEXT: s_wait_kmcnt 0x0 9006; GFX12-NEXT: v_mov_b32_e32 v3, v0 9007; GFX12-NEXT: s_mov_b32 s0, 0 9008; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9009; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9010; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9011; GFX12-NEXT: flat_load_b32 v4, v[0:1] 9012; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9013; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9014; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9015; GFX12-NEXT: v_not_b32_e32 v6, v3 9016; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start 9017; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9018; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9019; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9020; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9021; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 9022; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 9023; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9024; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9025; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 9026; GFX12-NEXT: s_wait_storecnt 0x0 9027; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9028; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9029; GFX12-NEXT: global_inv scope:SCOPE_DEV 9030; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9031; GFX12-NEXT: v_mov_b32_e32 v4, v3 9032; GFX12-NEXT: s_wait_alu 0xfffe 9033; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9034; GFX12-NEXT: s_wait_alu 0xfffe 9035; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9036; GFX12-NEXT: s_cbranch_execnz .LBB39_1 9037; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9038; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9039; GFX12-NEXT: s_wait_alu 0xfffe 9040; GFX12-NEXT: s_setpc_b64 s[30:31] 9041; 9042; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9043; GFX940: ; %bb.0: 9044; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9045; 
GFX940-NEXT: v_mov_b32_e32 v3, v0 9046; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 9047; GFX940-NEXT: flat_load_dword v5, v[0:1] 9048; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 9049; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9050; GFX940-NEXT: s_mov_b32 s0, 0xffff 9051; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9052; GFX940-NEXT: v_not_b32_e32 v6, v4 9053; GFX940-NEXT: s_mov_b64 s[0:1], 0 9054; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start 9055; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9056; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9057; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 9058; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 9059; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 9060; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 9061; GFX940-NEXT: buffer_wbl2 sc1 9062; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 9063; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9064; GFX940-NEXT: buffer_inv sc1 9065; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9066; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9067; GFX940-NEXT: v_mov_b32_e32 v5, v4 9068; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9069; GFX940-NEXT: s_cbranch_execnz .LBB39_1 9070; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9071; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9072; GFX940-NEXT: s_setpc_b64 s[30:31] 9073; 9074; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9075; GFX11: ; %bb.0: 9076; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9077; GFX11-NEXT: v_mov_b32_e32 v3, v0 9078; GFX11-NEXT: s_mov_b32 s0, 0 9079; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9080; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9081; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9082; GFX11-NEXT: flat_load_b32 v4, v[0:1] 9083; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9084; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9085; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9086; GFX11-NEXT: v_not_b32_e32 v6, v3 9087; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start 9088; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 9089; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9090; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9091; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9092; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 9093; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 9094; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9095; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9096; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 9097; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9098; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 9099; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9100; GFX11-NEXT: buffer_gl1_inv 9101; GFX11-NEXT: buffer_gl0_inv 9102; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9103; GFX11-NEXT: v_mov_b32_e32 v4, v3 9104; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9105; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9106; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9107; GFX11-NEXT: s_cbranch_execnz .LBB39_1 9108; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9109; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9110; GFX11-NEXT: s_setpc_b64 s[30:31] 9111; 9112; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9113; GFX10: ; %bb.0: 9114; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9115; GFX10-NEXT: v_mov_b32_e32 v3, v0 9116; GFX10-NEXT: s_mov_b32 s4, 0 9117; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9118; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9119; GFX10-NEXT: flat_load_dword v4, v[0:1] 9120; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9121; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9122; GFX10-NEXT: v_not_b32_e32 v6, v3 9123; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start 9124; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9125; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9126; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9127; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 9128; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 9129; GFX10-NEXT: 
v_and_or_b32 v3, v4, v6, v3 9130; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9131; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9132; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9133; GFX10-NEXT: buffer_gl1_inv 9134; GFX10-NEXT: buffer_gl0_inv 9135; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9136; GFX10-NEXT: v_mov_b32_e32 v4, v3 9137; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9138; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9139; GFX10-NEXT: s_cbranch_execnz .LBB39_1 9140; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9141; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9142; GFX10-NEXT: s_setpc_b64 s[30:31] 9143; 9144; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9145; GFX90A: ; %bb.0: 9146; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9147; GFX90A-NEXT: v_mov_b32_e32 v3, v0 9148; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9149; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9150; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9151; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9152; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9153; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9154; GFX90A-NEXT: v_not_b32_e32 v6, v4 9155; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9156; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start 9157; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9158; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9159; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 9160; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 9161; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 9162; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 9163; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 9164; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9165; GFX90A-NEXT: buffer_wbinvl1 9166; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9167; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9168; GFX90A-NEXT: v_mov_b32_e32 v5, v4 9169; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9170; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 9171; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9172; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9173; GFX90A-NEXT: 
s_setpc_b64 s[30:31] 9174; 9175; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9176; GFX908: ; %bb.0: 9177; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9178; GFX908-NEXT: v_mov_b32_e32 v3, v0 9179; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9180; GFX908-NEXT: flat_load_dword v4, v[0:1] 9181; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9182; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9183; GFX908-NEXT: s_mov_b32 s4, 0xffff 9184; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9185; GFX908-NEXT: v_not_b32_e32 v6, v3 9186; GFX908-NEXT: s_mov_b64 s[4:5], 0 9187; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start 9188; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9189; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9190; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9191; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 9192; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9193; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 9194; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9195; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9196; GFX908-NEXT: buffer_wbinvl1 9197; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9198; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9199; GFX908-NEXT: v_mov_b32_e32 v4, v3 9200; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9201; GFX908-NEXT: s_cbranch_execnz .LBB39_1 9202; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9203; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9204; GFX908-NEXT: s_setpc_b64 s[30:31] 9205; 9206; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9207; GFX8: ; %bb.0: 9208; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9209; GFX8-NEXT: v_mov_b32_e32 v3, v0 9210; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9211; GFX8-NEXT: flat_load_dword v4, v[0:1] 9212; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9213; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9214; GFX8-NEXT: s_mov_b32 s4, 0xffff 9215; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9216; GFX8-NEXT: v_not_b32_e32 v6, v3 9217; GFX8-NEXT: s_mov_b64 s[4:5], 0 9218; GFX8-NEXT: .LBB39_1: ; 
%atomicrmw.start 9219; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9220; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9221; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9222; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 9223; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 9224; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9225; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 9226; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9227; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9228; GFX8-NEXT: buffer_wbinvl1 9229; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9230; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9231; GFX8-NEXT: v_mov_b32_e32 v4, v3 9232; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9233; GFX8-NEXT: s_cbranch_execnz .LBB39_1 9234; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9235; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9236; GFX8-NEXT: s_setpc_b64 s[30:31] 9237; 9238; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: 9239; GFX7: ; %bb.0: 9240; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9241; GFX7-NEXT: v_mov_b32_e32 v3, v0 9242; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9243; GFX7-NEXT: flat_load_dword v4, v[0:1] 9244; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 9245; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 9246; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 9247; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 9248; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 9249; GFX7-NEXT: v_not_b32_e32 v6, v3 9250; GFX7-NEXT: s_mov_b64 s[4:5], 0 9251; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start 9252; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9253; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9254; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 9255; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 9256; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 9257; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 9258; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 9259; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 9260; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 9261; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9262; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9263; GFX7-NEXT: buffer_wbinvl1 9264; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9265; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9266; GFX7-NEXT: v_mov_b32_e32 v4, v3 9267; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9268; GFX7-NEXT: s_cbranch_execnz .LBB39_1 9269; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9270; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9271; GFX7-NEXT: s_setpc_b64 s[30:31] 9272 %unused = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9273 ret void 9274} 9275 9276define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 9277; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9278; GFX12: ; %bb.0: 9279; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9280; GFX12-NEXT: s_wait_expcnt 0x0 9281; GFX12-NEXT: s_wait_samplecnt 0x0 9282; GFX12-NEXT: s_wait_bvhcnt 0x0 9283; GFX12-NEXT: s_wait_kmcnt 0x0 9284; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9285; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9286; GFX12-NEXT: s_mov_b32 s0, 0 9287; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9288; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9289; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9290; GFX12-NEXT: flat_load_b32 v4, v[0:1] 9291; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9292; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9293; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9294; GFX12-NEXT: v_not_b32_e32 v6, v3 9295; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start 9296; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9297; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9298; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9299; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9300; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 9301; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 9302; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9303; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9304; GFX12-NEXT: 
v_and_or_b32 v3, v4, v6, v3 9305; GFX12-NEXT: s_wait_storecnt 0x0 9306; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9307; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9308; GFX12-NEXT: global_inv scope:SCOPE_DEV 9309; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9310; GFX12-NEXT: v_mov_b32_e32 v4, v3 9311; GFX12-NEXT: s_wait_alu 0xfffe 9312; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9313; GFX12-NEXT: s_wait_alu 0xfffe 9314; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9315; GFX12-NEXT: s_cbranch_execnz .LBB40_1 9316; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9317; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9318; GFX12-NEXT: s_wait_alu 0xfffe 9319; GFX12-NEXT: s_setpc_b64 s[30:31] 9320; 9321; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9322; GFX940: ; %bb.0: 9323; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9324; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 9325; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 9326; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 9327; GFX940-NEXT: v_mov_b32_e32 v1, v5 9328; GFX940-NEXT: flat_load_dword v5, v[0:1] 9329; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 9330; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9331; GFX940-NEXT: s_mov_b32 s0, 0xffff 9332; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9333; GFX940-NEXT: v_not_b32_e32 v6, v4 9334; GFX940-NEXT: s_mov_b64 s[0:1], 0 9335; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start 9336; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9337; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9338; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 9339; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 9340; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 9341; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 9342; GFX940-NEXT: buffer_wbl2 sc1 9343; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 9344; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9345; GFX940-NEXT: buffer_inv sc1 9346; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9347; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] 9348; GFX940-NEXT: v_mov_b32_e32 v5, v4 9349; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9350; GFX940-NEXT: s_cbranch_execnz .LBB40_1 9351; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9352; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9353; GFX940-NEXT: s_setpc_b64 s[30:31] 9354; 9355; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9356; GFX11: ; %bb.0: 9357; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9358; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9359; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9360; GFX11-NEXT: s_mov_b32 s0, 0 9361; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9362; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9363; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9364; GFX11-NEXT: flat_load_b32 v4, v[0:1] 9365; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9366; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9367; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9368; GFX11-NEXT: v_not_b32_e32 v6, v3 9369; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start 9370; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9371; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9372; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9373; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9374; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 9375; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 9376; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9377; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9378; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 9379; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9380; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 9381; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9382; GFX11-NEXT: buffer_gl1_inv 9383; GFX11-NEXT: buffer_gl0_inv 9384; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9385; GFX11-NEXT: v_mov_b32_e32 v4, v3 9386; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9387; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9388; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9389; GFX11-NEXT: s_cbranch_execnz .LBB40_1 9390; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9391; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9392; GFX11-NEXT: s_setpc_b64 s[30:31] 9393; 9394; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9395; GFX10: ; %bb.0: 9396; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9397; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9398; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9399; GFX10-NEXT: s_mov_b32 s4, 0 9400; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9401; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9402; GFX10-NEXT: flat_load_dword v4, v[0:1] 9403; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9404; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9405; GFX10-NEXT: v_not_b32_e32 v6, v3 9406; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start 9407; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9408; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9409; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9410; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 9411; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 9412; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 9413; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9414; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9415; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9416; GFX10-NEXT: buffer_gl1_inv 9417; GFX10-NEXT: buffer_gl0_inv 9418; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9419; GFX10-NEXT: v_mov_b32_e32 v4, v3 9420; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9421; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9422; GFX10-NEXT: s_cbranch_execnz .LBB40_1 9423; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9424; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9425; GFX10-NEXT: s_setpc_b64 s[30:31] 9426; 9427; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9428; GFX90A: ; %bb.0: 9429; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9430; 
GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 9431; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9432; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9433; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9434; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9435; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9436; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9437; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9438; GFX90A-NEXT: v_not_b32_e32 v6, v4 9439; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9440; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start 9441; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9442; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9443; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 9444; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 9445; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 9446; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 9447; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 9448; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9449; GFX90A-NEXT: buffer_wbinvl1 9450; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9451; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9452; GFX90A-NEXT: v_mov_b32_e32 v5, v4 9453; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9454; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 9455; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9456; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9457; GFX90A-NEXT: s_setpc_b64 s[30:31] 9458; 9459; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9460; GFX908: ; %bb.0: 9461; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9462; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 9463; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9464; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9465; GFX908-NEXT: flat_load_dword v4, v[0:1] 9466; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9467; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9468; GFX908-NEXT: s_mov_b32 s4, 0xffff 9469; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9470; GFX908-NEXT: v_not_b32_e32 v6, v3 9471; GFX908-NEXT: s_mov_b64 s[4:5], 0 9472; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start 
9473; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9474; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9475; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9476; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 9477; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9478; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 9479; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9480; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9481; GFX908-NEXT: buffer_wbinvl1 9482; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9483; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9484; GFX908-NEXT: v_mov_b32_e32 v4, v3 9485; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9486; GFX908-NEXT: s_cbranch_execnz .LBB40_1 9487; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9488; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9489; GFX908-NEXT: s_setpc_b64 s[30:31] 9490; 9491; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9492; GFX8: ; %bb.0: 9493; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9494; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 9495; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9496; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9497; GFX8-NEXT: flat_load_dword v4, v[0:1] 9498; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9499; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9500; GFX8-NEXT: s_mov_b32 s4, 0xffff 9501; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9502; GFX8-NEXT: v_not_b32_e32 v6, v3 9503; GFX8-NEXT: s_mov_b64 s[4:5], 0 9504; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start 9505; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9506; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9507; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9508; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 9509; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 9510; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9511; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 9512; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9513; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9514; GFX8-NEXT: buffer_wbinvl1 9515; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9516; GFX8-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] 9517; GFX8-NEXT: v_mov_b32_e32 v4, v3 9518; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9519; GFX8-NEXT: s_cbranch_execnz .LBB40_1 9520; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9521; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9522; GFX8-NEXT: s_setpc_b64 s[30:31] 9523; 9524; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 9525; GFX7: ; %bb.0: 9526; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9527; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 9528; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9529; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 9530; GFX7-NEXT: flat_load_dword v3, v[0:1] 9531; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 9532; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 9533; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 9534; GFX7-NEXT: s_mov_b64 s[4:5], 0 9535; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 9536; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 9537; GFX7-NEXT: v_not_b32_e32 v6, v2 9538; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start 9539; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9540; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9541; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 9542; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 9543; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 9544; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 9545; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 9546; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 9547; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 9548; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9549; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9550; GFX7-NEXT: buffer_wbinvl1 9551; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 9552; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9553; GFX7-NEXT: v_mov_b32_e32 v3, v2 9554; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9555; GFX7-NEXT: s_cbranch_execnz .LBB40_1 9556; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9557; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9558; GFX7-NEXT: s_setpc_b64 s[30:31] 9559 %gep = getelementptr half, ptr %ptr, i64 1023 9560 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 9561 ret void 9562} 9563 9564define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 9565; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9566; GFX12: ; %bb.0: 9567; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9568; GFX12-NEXT: s_wait_expcnt 0x0 9569; GFX12-NEXT: s_wait_samplecnt 0x0 9570; GFX12-NEXT: s_wait_bvhcnt 0x0 9571; GFX12-NEXT: s_wait_kmcnt 0x0 9572; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9573; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9574; GFX12-NEXT: s_mov_b32 s0, 0 9575; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9576; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9577; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9578; GFX12-NEXT: flat_load_b32 v4, v[0:1] 9579; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9580; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9581; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9582; GFX12-NEXT: v_not_b32_e32 v6, v3 9583; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start 9584; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9585; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9586; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9587; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9588; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 9589; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 9590; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9591; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9592; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 9593; GFX12-NEXT: s_wait_storecnt 0x0 9594; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9595; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9596; GFX12-NEXT: global_inv scope:SCOPE_DEV 9597; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9598; GFX12-NEXT: v_mov_b32_e32 v4, v3 9599; GFX12-NEXT: s_wait_alu 0xfffe 9600; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
9601; GFX12-NEXT: s_wait_alu 0xfffe 9602; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9603; GFX12-NEXT: s_cbranch_execnz .LBB41_1 9604; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9605; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9606; GFX12-NEXT: s_wait_alu 0xfffe 9607; GFX12-NEXT: s_setpc_b64 s[30:31] 9608; 9609; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9610; GFX940: ; %bb.0: 9611; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9612; GFX940-NEXT: s_movk_i32 s0, 0xf800 9613; GFX940-NEXT: s_mov_b32 s1, -1 9614; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 9615; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 9616; GFX940-NEXT: v_mov_b32_e32 v1, v5 9617; GFX940-NEXT: flat_load_dword v5, v[0:1] 9618; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 9619; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9620; GFX940-NEXT: s_mov_b32 s0, 0xffff 9621; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9622; GFX940-NEXT: v_not_b32_e32 v6, v4 9623; GFX940-NEXT: s_mov_b64 s[0:1], 0 9624; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start 9625; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9626; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9627; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 9628; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 9629; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 9630; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 9631; GFX940-NEXT: buffer_wbl2 sc1 9632; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 9633; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9634; GFX940-NEXT: buffer_inv sc1 9635; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9636; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9637; GFX940-NEXT: v_mov_b32_e32 v5, v4 9638; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9639; GFX940-NEXT: s_cbranch_execnz .LBB41_1 9640; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9641; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9642; GFX940-NEXT: s_setpc_b64 s[30:31] 9643; 9644; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 
9645; GFX11: ; %bb.0: 9646; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9647; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9648; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9649; GFX11-NEXT: s_mov_b32 s0, 0 9650; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9651; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9652; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9653; GFX11-NEXT: flat_load_b32 v4, v[0:1] 9654; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9655; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9656; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9657; GFX11-NEXT: v_not_b32_e32 v6, v3 9658; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start 9659; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9660; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9661; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9662; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9663; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 9664; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 9665; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9666; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9667; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 9668; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9669; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 9670; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9671; GFX11-NEXT: buffer_gl1_inv 9672; GFX11-NEXT: buffer_gl0_inv 9673; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9674; GFX11-NEXT: v_mov_b32_e32 v4, v3 9675; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9676; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9677; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9678; GFX11-NEXT: s_cbranch_execnz .LBB41_1 9679; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9680; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9681; GFX11-NEXT: s_setpc_b64 s[30:31] 9682; 9683; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9684; GFX10: ; %bb.0: 9685; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 9686; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9687; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9688; GFX10-NEXT: s_mov_b32 s4, 0 9689; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9690; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9691; GFX10-NEXT: flat_load_dword v4, v[0:1] 9692; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9693; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9694; GFX10-NEXT: v_not_b32_e32 v6, v3 9695; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start 9696; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9697; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9698; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9699; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 9700; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 9701; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 9702; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9703; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9704; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9705; GFX10-NEXT: buffer_gl1_inv 9706; GFX10-NEXT: buffer_gl0_inv 9707; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9708; GFX10-NEXT: v_mov_b32_e32 v4, v3 9709; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9710; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9711; GFX10-NEXT: s_cbranch_execnz .LBB41_1 9712; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9713; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9714; GFX10-NEXT: s_setpc_b64 s[30:31] 9715; 9716; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9717; GFX90A: ; %bb.0: 9718; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9719; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 9720; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 9721; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9722; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9723; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9724; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9725; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9726; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9727; 
GFX90A-NEXT: v_not_b32_e32 v6, v4 9728; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9729; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start 9730; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9731; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9732; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 9733; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 9734; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 9735; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 9736; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 9737; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9738; GFX90A-NEXT: buffer_wbinvl1 9739; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9740; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9741; GFX90A-NEXT: v_mov_b32_e32 v5, v4 9742; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9743; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 9744; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9745; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9746; GFX90A-NEXT: s_setpc_b64 s[30:31] 9747; 9748; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9749; GFX908: ; %bb.0: 9750; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9751; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 9752; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 9753; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9754; GFX908-NEXT: flat_load_dword v4, v[0:1] 9755; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9756; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9757; GFX908-NEXT: s_mov_b32 s4, 0xffff 9758; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9759; GFX908-NEXT: v_not_b32_e32 v6, v3 9760; GFX908-NEXT: s_mov_b64 s[4:5], 0 9761; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start 9762; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9763; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9764; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9765; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 9766; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9767; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 9768; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9769; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 9770; GFX908-NEXT: buffer_wbinvl1 9771; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9772; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9773; GFX908-NEXT: v_mov_b32_e32 v4, v3 9774; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9775; GFX908-NEXT: s_cbranch_execnz .LBB41_1 9776; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9777; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9778; GFX908-NEXT: s_setpc_b64 s[30:31] 9779; 9780; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9781; GFX8: ; %bb.0: 9782; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9783; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 9784; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 9785; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9786; GFX8-NEXT: flat_load_dword v4, v[0:1] 9787; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9788; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9789; GFX8-NEXT: s_mov_b32 s4, 0xffff 9790; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9791; GFX8-NEXT: v_not_b32_e32 v6, v3 9792; GFX8-NEXT: s_mov_b64 s[4:5], 0 9793; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start 9794; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9795; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9796; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9797; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 9798; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 9799; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9800; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 9801; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9802; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9803; GFX8-NEXT: buffer_wbinvl1 9804; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9805; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9806; GFX8-NEXT: v_mov_b32_e32 v4, v3 9807; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9808; GFX8-NEXT: s_cbranch_execnz .LBB41_1 9809; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9810; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9811; GFX8-NEXT: s_setpc_b64 s[30:31] 9812; 9813; GFX7-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 9814; GFX7: ; %bb.0: 9815; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9816; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 9817; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 9818; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 9819; GFX7-NEXT: flat_load_dword v3, v[0:1] 9820; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 9821; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 9822; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 9823; GFX7-NEXT: s_mov_b64 s[4:5], 0 9824; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 9825; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 9826; GFX7-NEXT: v_not_b32_e32 v6, v2 9827; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start 9828; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9829; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9830; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 9831; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 9832; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 9833; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 9834; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 9835; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 9836; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 9837; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9838; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9839; GFX7-NEXT: buffer_wbinvl1 9840; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 9841; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9842; GFX7-NEXT: v_mov_b32_e32 v3, v2 9843; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9844; GFX7-NEXT: s_cbranch_execnz .LBB41_1 9845; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9846; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9847; GFX7-NEXT: s_setpc_b64 s[30:31] 9848 %gep = getelementptr half, ptr %ptr, i64 -1024 9849 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9850 ret void 9851} 9852 9853define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 9854; GFX12-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 9855; GFX12: ; %bb.0: 9856; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9857; GFX12-NEXT: s_wait_expcnt 0x0 9858; GFX12-NEXT: s_wait_samplecnt 0x0 9859; GFX12-NEXT: s_wait_bvhcnt 0x0 9860; GFX12-NEXT: s_wait_kmcnt 0x0 9861; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 9862; GFX12-NEXT: s_mov_b32 s0, 0 9863; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start 9864; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9865; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9866; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 9867; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9868; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 9869; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 9870; GFX12-NEXT: s_wait_storecnt 0x0 9871; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9872; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9873; GFX12-NEXT: global_inv scope:SCOPE_DEV 9874; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9875; GFX12-NEXT: v_mov_b32_e32 v4, v3 9876; GFX12-NEXT: s_wait_alu 0xfffe 9877; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9878; GFX12-NEXT: s_wait_alu 0xfffe 9879; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9880; GFX12-NEXT: s_cbranch_execnz .LBB42_1 9881; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9882; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9883; GFX12-NEXT: s_wait_alu 0xfffe 9884; GFX12-NEXT: s_setpc_b64 s[30:31] 9885; 9886; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 9887; GFX940: ; %bb.0: 9888; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9889; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 9890; GFX940-NEXT: s_mov_b64 s[0:1], 0 9891; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 9892; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start 9893; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9894; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9895; GFX940-NEXT: v_add_f16_e32 v3, v5, 
v2 9896; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 9897; GFX940-NEXT: buffer_wbl2 sc1 9898; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 9899; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9900; GFX940-NEXT: buffer_inv sc1 9901; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 9902; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9903; GFX940-NEXT: v_mov_b32_e32 v5, v3 9904; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9905; GFX940-NEXT: s_cbranch_execnz .LBB42_1 9906; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9907; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9908; GFX940-NEXT: s_setpc_b64 s[30:31] 9909; 9910; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 9911; GFX11: ; %bb.0: 9912; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9913; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 9914; GFX11-NEXT: s_mov_b32 s0, 0 9915; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start 9916; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9917; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9918; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 9919; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9920; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 9921; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 9922; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9923; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 9924; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9925; GFX11-NEXT: buffer_gl1_inv 9926; GFX11-NEXT: buffer_gl0_inv 9927; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9928; GFX11-NEXT: v_mov_b32_e32 v4, v3 9929; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9930; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9931; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9932; GFX11-NEXT: s_cbranch_execnz .LBB42_1 9933; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9934; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9935; GFX11-NEXT: s_setpc_b64 s[30:31] 9936; 9937; GFX10-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 9938; GFX10: ; %bb.0: 9939; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9940; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 9941; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9942; GFX10-NEXT: s_mov_b32 s4, 0 9943; GFX10-NEXT: flat_load_dword v4, v[0:1] 9944; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start 9945; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9946; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9947; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 9948; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 9949; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 9950; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9951; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9952; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9953; GFX10-NEXT: buffer_gl1_inv 9954; GFX10-NEXT: buffer_gl0_inv 9955; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9956; GFX10-NEXT: v_mov_b32_e32 v4, v3 9957; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9958; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9959; GFX10-NEXT: s_cbranch_execnz .LBB42_1 9960; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9961; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9962; GFX10-NEXT: s_setpc_b64 s[30:31] 9963; 9964; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 9965; GFX90A: ; %bb.0: 9966; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9967; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 9968; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9969; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 9970; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start 9971; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9972; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9973; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 9974; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 9975; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 9976; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9977; GFX90A-NEXT: buffer_wbinvl1 9978; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 9979; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9980; GFX90A-NEXT: v_mov_b32_e32 v5, v3 9981; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9982; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 9983; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9984; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9985; GFX90A-NEXT: s_setpc_b64 s[30:31] 9986; 9987; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 9988; GFX908: ; %bb.0: 9989; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9990; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 9991; GFX908-NEXT: s_mov_b64 s[4:5], 0 9992; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 9993; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start 9994; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9995; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9996; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 9997; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 9998; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 9999; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10000; GFX908-NEXT: buffer_wbinvl1 10001; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10002; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10003; GFX908-NEXT: v_mov_b32_e32 v4, v3 10004; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10005; GFX908-NEXT: s_cbranch_execnz .LBB42_1 10006; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10007; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10008; GFX908-NEXT: s_setpc_b64 s[30:31] 10009; 10010; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 10011; GFX8: ; %bb.0: 10012; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10013; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 10014; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10015; GFX8-NEXT: flat_load_dword v4, v[0:1] 10016; GFX8-NEXT: s_mov_b64 s[4:5], 0 10017; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start 10018; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10019; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10020; 
GFX8-NEXT: v_add_f16_e32 v3, v4, v2 10021; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 10022; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 10023; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10024; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10025; GFX8-NEXT: buffer_wbinvl1 10026; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10027; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10028; GFX8-NEXT: v_mov_b32_e32 v4, v3 10029; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10030; GFX8-NEXT: s_cbranch_execnz .LBB42_1 10031; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10032; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10033; GFX8-NEXT: s_setpc_b64 s[30:31] 10034; 10035; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 10036; GFX7: ; %bb.0: 10037; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10038; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 10039; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10040; GFX7-NEXT: flat_load_dword v3, v[0:1] 10041; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 10042; GFX7-NEXT: s_mov_b64 s[4:5], 0 10043; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 10044; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start 10045; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10046; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10047; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 10048; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 10049; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 10050; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 10051; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 10052; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10053; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10054; GFX7-NEXT: buffer_wbinvl1 10055; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10056; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10057; GFX7-NEXT: v_mov_b32_e32 v3, v2 10058; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10059; GFX7-NEXT: s_cbranch_execnz .LBB42_1 10060; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10061; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10062; GFX7-NEXT: s_setpc_b64 s[30:31] 10063 %gep = getelementptr half, ptr 
%ptr, i64 1023 10064 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 10065 ret void 10066} 10067 10068define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 10069; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10070; GFX12: ; %bb.0: 10071; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10072; GFX12-NEXT: s_wait_expcnt 0x0 10073; GFX12-NEXT: s_wait_samplecnt 0x0 10074; GFX12-NEXT: s_wait_bvhcnt 0x0 10075; GFX12-NEXT: s_wait_kmcnt 0x0 10076; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 10077; GFX12-NEXT: s_mov_b32 s0, 0 10078; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start 10079; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10080; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10081; GFX12-NEXT: v_mov_b32_e32 v4, v3 10082; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10083; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 10084; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 10085; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10086; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 10087; GFX12-NEXT: s_wait_storecnt 0x0 10088; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 10089; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10090; GFX12-NEXT: global_inv scope:SCOPE_DEV 10091; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10092; GFX12-NEXT: s_wait_alu 0xfffe 10093; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10094; GFX12-NEXT: s_wait_alu 0xfffe 10095; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10096; GFX12-NEXT: s_cbranch_execnz .LBB43_1 10097; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10098; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10099; GFX12-NEXT: v_mov_b32_e32 v0, v3 10100; GFX12-NEXT: s_wait_alu 0xfffe 10101; GFX12-NEXT: s_setpc_b64 s[30:31] 10102; 10103; GFX940-LABEL: 
flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10104; GFX940: ; %bb.0: 10105; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10106; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 10107; GFX940-NEXT: s_mov_b64 s[0:1], 0 10108; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 10109; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start 10110; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10111; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10112; GFX940-NEXT: v_mov_b32_e32 v5, v3 10113; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 10114; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 10115; GFX940-NEXT: buffer_wbl2 sc1 10116; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 10117; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10118; GFX940-NEXT: buffer_inv sc1 10119; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 10120; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10121; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10122; GFX940-NEXT: s_cbranch_execnz .LBB43_1 10123; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10124; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10125; GFX940-NEXT: v_mov_b32_e32 v0, v3 10126; GFX940-NEXT: s_setpc_b64 s[30:31] 10127; 10128; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10129; GFX11: ; %bb.0: 10130; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10131; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 10132; GFX11-NEXT: s_mov_b32 s0, 0 10133; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start 10134; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10135; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10136; GFX11-NEXT: v_mov_b32_e32 v4, v3 10137; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10138; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 10139; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 10140; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10141; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 10142; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10143; GFX11-NEXT: 
flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 10144; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10145; GFX11-NEXT: buffer_gl1_inv 10146; GFX11-NEXT: buffer_gl0_inv 10147; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10148; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10149; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10150; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10151; GFX11-NEXT: s_cbranch_execnz .LBB43_1 10152; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10153; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 10154; GFX11-NEXT: v_mov_b32_e32 v0, v3 10155; GFX11-NEXT: s_setpc_b64 s[30:31] 10156; 10157; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10158; GFX10: ; %bb.0: 10159; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10160; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10161; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 10162; GFX10-NEXT: s_mov_b32 s4, 0 10163; GFX10-NEXT: flat_load_dword v0, v[3:4] 10164; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start 10165; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10166; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10167; GFX10-NEXT: v_mov_b32_e32 v1, v0 10168; GFX10-NEXT: v_add_f16_e32 v0, v1, v2 10169; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 10170; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 10171; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10172; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 10173; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10174; GFX10-NEXT: buffer_gl1_inv 10175; GFX10-NEXT: buffer_gl0_inv 10176; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 10177; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10178; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10179; GFX10-NEXT: s_cbranch_execnz .LBB43_1 10180; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10181; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10182; GFX10-NEXT: s_setpc_b64 s[30:31] 10183; 10184; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 
10185; GFX90A: ; %bb.0: 10186; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10187; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 10188; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10189; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 10190; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start 10191; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10192; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10193; GFX90A-NEXT: v_mov_b32_e32 v5, v3 10194; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 10195; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 10196; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 10197; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10198; GFX90A-NEXT: buffer_wbinvl1 10199; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 10200; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10201; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10202; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 10203; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10204; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10205; GFX90A-NEXT: v_mov_b32_e32 v0, v3 10206; GFX90A-NEXT: s_setpc_b64 s[30:31] 10207; 10208; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10209; GFX908: ; %bb.0: 10210; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10211; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 10212; GFX908-NEXT: s_mov_b64 s[4:5], 0 10213; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 10214; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start 10215; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10216; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10217; GFX908-NEXT: v_mov_b32_e32 v4, v3 10218; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 10219; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 10220; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 10221; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10222; GFX908-NEXT: buffer_wbinvl1 10223; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10224; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10225; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10226; 
GFX908-NEXT: s_cbranch_execnz .LBB43_1 10227; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10228; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10229; GFX908-NEXT: v_mov_b32_e32 v0, v3 10230; GFX908-NEXT: s_setpc_b64 s[30:31] 10231; 10232; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10233; GFX8: ; %bb.0: 10234; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10235; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 10236; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 10237; GFX8-NEXT: flat_load_dword v0, v[3:4] 10238; GFX8-NEXT: s_mov_b64 s[4:5], 0 10239; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start 10240; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10241; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10242; GFX8-NEXT: v_mov_b32_e32 v1, v0 10243; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 10244; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10245; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 10246; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 10247; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10248; GFX8-NEXT: buffer_wbinvl1 10249; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 10250; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10251; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10252; GFX8-NEXT: s_cbranch_execnz .LBB43_1 10253; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10254; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10255; GFX8-NEXT: s_setpc_b64 s[30:31] 10256; 10257; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10258; GFX7: ; %bb.0: 10259; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10260; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 10261; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10262; GFX7-NEXT: flat_load_dword v3, v[0:1] 10263; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 10264; GFX7-NEXT: s_mov_b64 s[4:5], 0 10265; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 10266; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start 10267; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10268; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10269; 
GFX7-NEXT: v_mov_b32_e32 v4, v3 10270; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 10271; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 10272; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 10273; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 10274; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 10275; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10276; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10277; GFX7-NEXT: buffer_wbinvl1 10278; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10279; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10280; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10281; GFX7-NEXT: s_cbranch_execnz .LBB43_1 10282; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10283; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10284; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 10285; GFX7-NEXT: s_setpc_b64 s[30:31] 10286 %gep = getelementptr half, ptr %ptr, i64 1023 10287 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 10288 ret half %result 10289} 10290 10291define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 10292; GFX12-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10293; GFX12: ; %bb.0: 10294; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10295; GFX12-NEXT: s_wait_expcnt 0x0 10296; GFX12-NEXT: s_wait_samplecnt 0x0 10297; GFX12-NEXT: s_wait_bvhcnt 0x0 10298; GFX12-NEXT: s_wait_kmcnt 0x0 10299; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10300; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10301; GFX12-NEXT: s_mov_b32 s0, 0 10302; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10303; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 10304; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 10305; GFX12-NEXT: flat_load_b32 v5, v[0:1] 10306; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10307; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 10308; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10309; GFX12-NEXT: v_not_b32_e32 v4, v4 10310; GFX12-NEXT: .LBB44_1: 
; %atomicrmw.start 10311; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10312; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10313; GFX12-NEXT: v_mov_b32_e32 v6, v5 10314; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10315; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 10316; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 10317; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10318; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 10319; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 10320; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10321; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 10322; GFX12-NEXT: global_wb scope:SCOPE_SYS 10323; GFX12-NEXT: s_wait_storecnt 0x0 10324; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 10325; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10326; GFX12-NEXT: global_inv scope:SCOPE_SYS 10327; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 10328; GFX12-NEXT: s_wait_alu 0xfffe 10329; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10330; GFX12-NEXT: s_wait_alu 0xfffe 10331; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10332; GFX12-NEXT: s_cbranch_execnz .LBB44_1 10333; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10334; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10335; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 10336; GFX12-NEXT: s_wait_alu 0xfffe 10337; GFX12-NEXT: s_setpc_b64 s[30:31] 10338; 10339; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10340; GFX940: ; %bb.0: 10341; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10342; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 10343; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] 10344; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 10345; GFX940-NEXT: v_mov_b32_e32 v1, v7 10346; GFX940-NEXT: flat_load_dword v4, v[0:1] 10347; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 10348; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10349; GFX940-NEXT: s_mov_b32 s0, 0xffff 10350; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 
10351; GFX940-NEXT: v_not_b32_e32 v5, v5 10352; GFX940-NEXT: s_mov_b64 s[0:1], 0 10353; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start 10354; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10355; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10356; GFX940-NEXT: v_mov_b32_e32 v7, v4 10357; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 10358; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 10359; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 10360; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 10361; GFX940-NEXT: buffer_wbl2 sc0 sc1 10362; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 10363; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10364; GFX940-NEXT: buffer_inv sc0 sc1 10365; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 10366; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10367; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10368; GFX940-NEXT: s_cbranch_execnz .LBB44_1 10369; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10370; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10371; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 10372; GFX940-NEXT: s_setpc_b64 s[30:31] 10373; 10374; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10375; GFX11: ; %bb.0: 10376; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10377; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10378; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10379; GFX11-NEXT: s_mov_b32 s0, 0 10380; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10381; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 10382; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 10383; GFX11-NEXT: flat_load_b32 v5, v[0:1] 10384; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10385; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 10386; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10387; GFX11-NEXT: v_not_b32_e32 v4, v4 10388; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start 10389; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10390; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10391; GFX11-NEXT: v_mov_b32_e32 v6, v5 10392; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10393; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 10394; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 10395; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10396; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 10397; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 10398; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10399; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 10400; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10401; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 10402; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10403; GFX11-NEXT: buffer_gl1_inv 10404; GFX11-NEXT: buffer_gl0_inv 10405; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 10406; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10407; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10408; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10409; GFX11-NEXT: s_cbranch_execnz .LBB44_1 10410; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10411; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 10412; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 10413; GFX11-NEXT: s_setpc_b64 s[30:31] 10414; 10415; GFX10-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10416; GFX10: ; %bb.0: 10417; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10418; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10419; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10420; GFX10-NEXT: s_mov_b32 s4, 0 10421; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 10422; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 10423; GFX10-NEXT: flat_load_dword v5, v[0:1] 10424; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10425; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 10426; GFX10-NEXT: v_not_b32_e32 v4, v4 10427; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start 10428; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10429; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10430; GFX10-NEXT: v_mov_b32_e32 v6, v5 10431; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 10432; GFX10-NEXT: 
v_add_f16_e32 v5, v5, v2 10433; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 10434; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 10435; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10436; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 10437; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10438; GFX10-NEXT: buffer_gl1_inv 10439; GFX10-NEXT: buffer_gl0_inv 10440; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 10441; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10442; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10443; GFX10-NEXT: s_cbranch_execnz .LBB44_1 10444; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10445; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10446; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 10447; GFX10-NEXT: s_setpc_b64 s[30:31] 10448; 10449; GFX90A-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10450; GFX90A: ; %bb.0: 10451; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10452; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 10453; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10454; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 10455; GFX90A-NEXT: flat_load_dword v4, v[0:1] 10456; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 10457; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10458; GFX90A-NEXT: s_mov_b32 s4, 0xffff 10459; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 10460; GFX90A-NEXT: v_not_b32_e32 v5, v5 10461; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10462; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start 10463; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10464; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10465; GFX90A-NEXT: v_mov_b32_e32 v7, v4 10466; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 10467; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 10468; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 10469; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 10470; GFX90A-NEXT: buffer_wbl2 10471; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 10472; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10473; GFX90A-NEXT: 
buffer_invl2 10474; GFX90A-NEXT: buffer_wbinvl1 10475; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 10476; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10477; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10478; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 10479; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10480; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10481; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 10482; GFX90A-NEXT: s_setpc_b64 s[30:31] 10483; 10484; GFX908-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10485; GFX908: ; %bb.0: 10486; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10487; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 10488; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10489; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 10490; GFX908-NEXT: flat_load_dword v4, v[0:1] 10491; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 10492; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10493; GFX908-NEXT: s_mov_b32 s4, 0xffff 10494; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 10495; GFX908-NEXT: v_not_b32_e32 v5, v5 10496; GFX908-NEXT: s_mov_b64 s[4:5], 0 10497; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start 10498; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10499; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10500; GFX908-NEXT: v_mov_b32_e32 v7, v4 10501; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 10502; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 10503; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 10504; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 10505; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 10506; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10507; GFX908-NEXT: buffer_wbinvl1 10508; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 10509; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10510; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10511; GFX908-NEXT: s_cbranch_execnz .LBB44_1 10512; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10513; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10514; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 10515; GFX908-NEXT: s_setpc_b64 
s[30:31] 10516; 10517; GFX8-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10518; GFX8: ; %bb.0: 10519; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10520; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 10521; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10522; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 10523; GFX8-NEXT: flat_load_dword v5, v[0:1] 10524; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 10525; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10526; GFX8-NEXT: s_mov_b32 s4, 0xffff 10527; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 10528; GFX8-NEXT: v_not_b32_e32 v4, v4 10529; GFX8-NEXT: s_mov_b64 s[4:5], 0 10530; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start 10531; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10532; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10533; GFX8-NEXT: v_mov_b32_e32 v6, v5 10534; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 10535; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 10536; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 10537; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 10538; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 10539; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 10540; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10541; GFX8-NEXT: buffer_wbinvl1 10542; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 10543; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10544; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10545; GFX8-NEXT: s_cbranch_execnz .LBB44_1 10546; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10547; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10548; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 10549; GFX8-NEXT: s_setpc_b64 s[30:31] 10550; 10551; GFX7-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10552; GFX7: ; %bb.0: 10553; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10554; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 10555; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10556; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 10557; GFX7-NEXT: flat_load_dword v5, v[0:1] 10558; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 10559; GFX7-NEXT: 
v_and_b32_e32 v2, 3, v3 10560; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 10561; GFX7-NEXT: s_mov_b64 s[4:5], 0 10562; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 10563; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 10564; GFX7-NEXT: v_not_b32_e32 v4, v4 10565; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start 10566; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10567; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10568; GFX7-NEXT: v_mov_b32_e32 v6, v5 10569; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 10570; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 10571; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 10572; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 10573; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 10574; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 10575; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 10576; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 10577; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10578; GFX7-NEXT: buffer_wbinvl1 10579; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 10580; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10581; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10582; GFX7-NEXT: s_cbranch_execnz .LBB44_1 10583; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10584; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10585; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 10586; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 10587; GFX7-NEXT: s_setpc_b64 s[30:31] 10588 %gep = getelementptr half, ptr %ptr, i64 1023 10589 %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 10590 ret half %result 10591} 10592 10593define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 10594; GFX12-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10595; GFX12: ; %bb.0: 10596; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10597; GFX12-NEXT: s_wait_expcnt 0x0 10598; GFX12-NEXT: s_wait_samplecnt 0x0 10599; GFX12-NEXT: s_wait_bvhcnt 0x0 10600; GFX12-NEXT: s_wait_kmcnt 0x0 10601; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10602; GFX12-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v1, vcc_lo 10603; GFX12-NEXT: s_mov_b32 s0, 0 10604; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10605; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 10606; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 10607; GFX12-NEXT: flat_load_b32 v4, v[0:1] 10608; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10609; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 10610; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10611; GFX12-NEXT: v_not_b32_e32 v6, v3 10612; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start 10613; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10614; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10615; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10616; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10617; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 10618; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 10619; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10620; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 10621; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 10622; GFX12-NEXT: global_wb scope:SCOPE_SYS 10623; GFX12-NEXT: s_wait_storecnt 0x0 10624; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 10625; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10626; GFX12-NEXT: global_inv scope:SCOPE_SYS 10627; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10628; GFX12-NEXT: v_mov_b32_e32 v4, v3 10629; GFX12-NEXT: s_wait_alu 0xfffe 10630; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10631; GFX12-NEXT: s_wait_alu 0xfffe 10632; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10633; GFX12-NEXT: s_cbranch_execnz .LBB45_1 10634; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10635; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10636; GFX12-NEXT: s_wait_alu 0xfffe 10637; GFX12-NEXT: s_setpc_b64 s[30:31] 10638; 10639; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10640; GFX940: ; %bb.0: 10641; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10642; GFX940-NEXT: 
s_mov_b64 s[0:1], 0x7fe 10643; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 10644; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 10645; GFX940-NEXT: v_mov_b32_e32 v1, v5 10646; GFX940-NEXT: flat_load_dword v5, v[0:1] 10647; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 10648; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10649; GFX940-NEXT: s_mov_b32 s0, 0xffff 10650; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 10651; GFX940-NEXT: v_not_b32_e32 v6, v4 10652; GFX940-NEXT: s_mov_b64 s[0:1], 0 10653; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start 10654; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10655; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10656; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 10657; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 10658; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 10659; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 10660; GFX940-NEXT: buffer_wbl2 sc0 sc1 10661; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 10662; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10663; GFX940-NEXT: buffer_inv sc0 sc1 10664; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 10665; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10666; GFX940-NEXT: v_mov_b32_e32 v5, v4 10667; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10668; GFX940-NEXT: s_cbranch_execnz .LBB45_1 10669; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10670; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10671; GFX940-NEXT: s_setpc_b64 s[30:31] 10672; 10673; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10674; GFX11: ; %bb.0: 10675; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10676; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10677; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10678; GFX11-NEXT: s_mov_b32 s0, 0 10679; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10680; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 10681; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 10682; GFX11-NEXT: flat_load_b32 v4, v[0:1] 10683; GFX11-NEXT: v_lshlrev_b32_e32 
v5, 3, v3 10684; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 10685; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10686; GFX11-NEXT: v_not_b32_e32 v6, v3 10687; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start 10688; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10689; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10690; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10691; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10692; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 10693; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 10694; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10695; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 10696; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 10697; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10698; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 10699; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10700; GFX11-NEXT: buffer_gl1_inv 10701; GFX11-NEXT: buffer_gl0_inv 10702; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10703; GFX11-NEXT: v_mov_b32_e32 v4, v3 10704; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10705; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10706; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10707; GFX11-NEXT: s_cbranch_execnz .LBB45_1 10708; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10709; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 10710; GFX11-NEXT: s_setpc_b64 s[30:31] 10711; 10712; GFX10-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10713; GFX10: ; %bb.0: 10714; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10715; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 10716; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10717; GFX10-NEXT: s_mov_b32 s4, 0 10718; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 10719; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 10720; GFX10-NEXT: flat_load_dword v4, v[0:1] 10721; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10722; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 10723; GFX10-NEXT: v_not_b32_e32 v6, v3 10724; 
GFX10-NEXT: .LBB45_1: ; %atomicrmw.start 10725; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10726; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10727; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10728; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 10729; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 10730; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 10731; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10732; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10733; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10734; GFX10-NEXT: buffer_gl1_inv 10735; GFX10-NEXT: buffer_gl0_inv 10736; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10737; GFX10-NEXT: v_mov_b32_e32 v4, v3 10738; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10739; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10740; GFX10-NEXT: s_cbranch_execnz .LBB45_1 10741; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10742; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10743; GFX10-NEXT: s_setpc_b64 s[30:31] 10744; 10745; GFX90A-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10746; GFX90A: ; %bb.0: 10747; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10748; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 10749; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10750; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 10751; GFX90A-NEXT: flat_load_dword v5, v[0:1] 10752; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 10753; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10754; GFX90A-NEXT: s_mov_b32 s4, 0xffff 10755; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 10756; GFX90A-NEXT: v_not_b32_e32 v6, v4 10757; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10758; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start 10759; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10760; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10761; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 10762; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 10763; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 10764; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 
10765; GFX90A-NEXT: buffer_wbl2 10766; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 10767; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10768; GFX90A-NEXT: buffer_invl2 10769; GFX90A-NEXT: buffer_wbinvl1 10770; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 10771; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10772; GFX90A-NEXT: v_mov_b32_e32 v5, v4 10773; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10774; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 10775; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10776; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10777; GFX90A-NEXT: s_setpc_b64 s[30:31] 10778; 10779; GFX908-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10780; GFX908: ; %bb.0: 10781; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10782; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 10783; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10784; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 10785; GFX908-NEXT: flat_load_dword v4, v[0:1] 10786; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 10787; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10788; GFX908-NEXT: s_mov_b32 s4, 0xffff 10789; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 10790; GFX908-NEXT: v_not_b32_e32 v6, v3 10791; GFX908-NEXT: s_mov_b64 s[4:5], 0 10792; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start 10793; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10794; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10795; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10796; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 10797; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 10798; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 10799; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10800; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10801; GFX908-NEXT: buffer_wbinvl1 10802; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10803; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10804; GFX908-NEXT: v_mov_b32_e32 v4, v3 10805; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10806; GFX908-NEXT: s_cbranch_execnz .LBB45_1 10807; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end 10808; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10809; GFX908-NEXT: s_setpc_b64 s[30:31] 10810; 10811; GFX8-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10812; GFX8: ; %bb.0: 10813; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10814; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 10815; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10816; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 10817; GFX8-NEXT: flat_load_dword v4, v[0:1] 10818; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 10819; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10820; GFX8-NEXT: s_mov_b32 s4, 0xffff 10821; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 10822; GFX8-NEXT: v_not_b32_e32 v6, v3 10823; GFX8-NEXT: s_mov_b64 s[4:5], 0 10824; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start 10825; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10826; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10827; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10828; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 10829; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 10830; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 10831; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 10832; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10833; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10834; GFX8-NEXT: buffer_wbinvl1 10835; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10836; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10837; GFX8-NEXT: v_mov_b32_e32 v4, v3 10838; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10839; GFX8-NEXT: s_cbranch_execnz .LBB45_1 10840; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10841; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10842; GFX8-NEXT: s_setpc_b64 s[30:31] 10843; 10844; GFX7-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 10845; GFX7: ; %bb.0: 10846; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10847; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 10848; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10849; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 10850; GFX7-NEXT: flat_load_dword v3, v[0:1] 10851; 
GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 10852; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 10853; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10854; GFX7-NEXT: s_mov_b64 s[4:5], 0 10855; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 10856; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 10857; GFX7-NEXT: v_not_b32_e32 v6, v2 10858; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start 10859; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10860; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10861; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10862; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 10863; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 10864; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 10865; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 10866; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10867; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 10868; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10869; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10870; GFX7-NEXT: buffer_wbinvl1 10871; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10872; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10873; GFX7-NEXT: v_mov_b32_e32 v3, v2 10874; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10875; GFX7-NEXT: s_cbranch_execnz .LBB45_1 10876; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10877; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10878; GFX7-NEXT: s_setpc_b64 s[30:31] 10879 %gep = getelementptr half, ptr %ptr, i64 1023 10880 %unused = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 10881 ret void 10882} 10883 10884; -------------------------------------------------------------------- 10885; bfloat 10886; -------------------------------------------------------------------- 10887 10888define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 10889; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 10890; GFX12: ; %bb.0: 10891; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10892; GFX12-NEXT: s_wait_expcnt 0x0 10893; GFX12-NEXT: s_wait_samplecnt 0x0 10894; GFX12-NEXT: s_wait_bvhcnt 0x0 10895; GFX12-NEXT: s_wait_kmcnt 
0x0 10896; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 10897; GFX12-NEXT: s_mov_b32 s0, 0 10898; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10899; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 10900; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 10901; GFX12-NEXT: flat_load_b32 v5, v[0:1] 10902; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10903; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 10904; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10905; GFX12-NEXT: v_not_b32_e32 v4, v4 10906; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start 10907; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10908; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10909; GFX12-NEXT: v_mov_b32_e32 v6, v5 10910; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10911; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 10912; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 10913; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10914; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 10915; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 10916; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 10917; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 10918; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 10919; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 10920; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 10921; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10922; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 10923; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 10924; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10925; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 10926; GFX12-NEXT: s_wait_storecnt 0x0 10927; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 10928; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10929; GFX12-NEXT: global_inv scope:SCOPE_DEV 10930; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 10931; GFX12-NEXT: s_wait_alu 0xfffe 10932; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 10933; GFX12-NEXT: s_wait_alu 0xfffe 10934; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10935; GFX12-NEXT: s_cbranch_execnz .LBB46_1 10936; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10937; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10938; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 10939; GFX12-NEXT: s_wait_alu 0xfffe 10940; GFX12-NEXT: s_setpc_b64 s[30:31] 10941; 10942; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 10943; GFX940: ; %bb.0: 10944; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10945; GFX940-NEXT: v_mov_b32_e32 v3, v0 10946; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 10947; GFX940-NEXT: flat_load_dword v5, v[0:1] 10948; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 10949; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10950; GFX940-NEXT: s_mov_b32 s0, 0xffff 10951; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 10952; GFX940-NEXT: v_not_b32_e32 v4, v4 10953; GFX940-NEXT: s_mov_b64 s[0:1], 0 10954; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10955; GFX940-NEXT: s_movk_i32 s2, 0x7fff 10956; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start 10957; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10958; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10959; GFX940-NEXT: v_mov_b32_e32 v7, v5 10960; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10961; GFX940-NEXT: s_nop 0 10962; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 10963; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 10964; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 10965; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 10966; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10967; GFX940-NEXT: s_nop 1 10968; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 10969; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10970; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 10971; GFX940-NEXT: buffer_wbl2 sc1 10972; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 10973; GFX940-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 10974; GFX940-NEXT: buffer_inv sc1 10975; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 10976; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10977; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10978; GFX940-NEXT: s_cbranch_execnz .LBB46_1 10979; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10980; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10981; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 10982; GFX940-NEXT: s_setpc_b64 s[30:31] 10983; 10984; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 10985; GFX11: ; %bb.0: 10986; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10987; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 10988; GFX11-NEXT: s_mov_b32 s0, 0 10989; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10990; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 10991; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 10992; GFX11-NEXT: flat_load_b32 v5, v[0:1] 10993; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10994; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 10995; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10996; GFX11-NEXT: v_not_b32_e32 v4, v4 10997; GFX11-NEXT: .p2align 6 10998; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start 10999; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11000; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11001; GFX11-NEXT: v_mov_b32_e32 v6, v5 11002; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11003; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11004; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11005; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11006; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 11007; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 11008; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 11009; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11010; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11011; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11012; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 
11013; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11014; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11015; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11017; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 11018; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11019; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 11020; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11021; GFX11-NEXT: buffer_gl1_inv 11022; GFX11-NEXT: buffer_gl0_inv 11023; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11024; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11025; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11026; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11027; GFX11-NEXT: s_cbranch_execnz .LBB46_1 11028; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11029; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11030; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11031; GFX11-NEXT: s_setpc_b64 s[30:31] 11032; 11033; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 11034; GFX10: ; %bb.0: 11035; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11036; GFX10-NEXT: v_mov_b32_e32 v3, v0 11037; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11038; GFX10-NEXT: s_mov_b32 s4, 0 11039; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 11040; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 11041; GFX10-NEXT: flat_load_dword v5, v[0:1] 11042; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11043; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11044; GFX10-NEXT: v_not_b32_e32 v4, v4 11045; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start 11046; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11047; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11048; GFX10-NEXT: v_mov_b32_e32 v6, v5 11049; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11050; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 11051; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 11052; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 11053; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, 
v5 11054; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11055; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11056; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11057; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 11058; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11059; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11060; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11061; GFX10-NEXT: buffer_gl1_inv 11062; GFX10-NEXT: buffer_gl0_inv 11063; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11064; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11065; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11066; GFX10-NEXT: s_cbranch_execnz .LBB46_1 11067; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11068; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11069; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11070; GFX10-NEXT: s_setpc_b64 s[30:31] 11071; 11072; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 11073; GFX90A: ; %bb.0: 11074; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11075; GFX90A-NEXT: v_mov_b32_e32 v3, v0 11076; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 11077; GFX90A-NEXT: flat_load_dword v5, v[0:1] 11078; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 11079; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11080; GFX90A-NEXT: s_mov_b32 s4, 0xffff 11081; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11082; GFX90A-NEXT: v_not_b32_e32 v4, v4 11083; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11084; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11085; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11086; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start 11087; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11088; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11089; GFX90A-NEXT: v_mov_b32_e32 v7, v5 11090; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11091; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 11092; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 11093; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 11094; 
GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 11095; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11096; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11097; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11098; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 11099; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 11100; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11101; GFX90A-NEXT: buffer_wbinvl1 11102; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11103; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11104; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11105; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 11106; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11107; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11108; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11109; GFX90A-NEXT: s_setpc_b64 s[30:31] 11110; 11111; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 11112; GFX908: ; %bb.0: 11113; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11114; GFX908-NEXT: v_mov_b32_e32 v3, v0 11115; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 11116; GFX908-NEXT: flat_load_dword v5, v[0:1] 11117; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 11118; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11119; GFX908-NEXT: s_mov_b32 s4, 0xffff 11120; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11121; GFX908-NEXT: v_not_b32_e32 v4, v4 11122; GFX908-NEXT: s_mov_b64 s[4:5], 0 11123; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11124; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11125; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start 11126; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11127; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11128; GFX908-NEXT: v_mov_b32_e32 v6, v5 11129; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11130; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 11131; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 11132; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 11133; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 
11134; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11135; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 11136; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11137; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 11138; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11139; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11140; GFX908-NEXT: buffer_wbinvl1 11141; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11142; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11143; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11144; GFX908-NEXT: s_cbranch_execnz .LBB46_1 11145; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11146; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11147; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11148; GFX908-NEXT: s_setpc_b64 s[30:31] 11149; 11150; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 11151; GFX8: ; %bb.0: 11152; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11153; GFX8-NEXT: v_mov_b32_e32 v3, v0 11154; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 11155; GFX8-NEXT: flat_load_dword v5, v[0:1] 11156; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 11157; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11158; GFX8-NEXT: s_mov_b32 s4, 0xffff 11159; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11160; GFX8-NEXT: v_not_b32_e32 v4, v4 11161; GFX8-NEXT: s_mov_b64 s[4:5], 0 11162; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11163; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start 11164; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11165; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11166; GFX8-NEXT: v_mov_b32_e32 v6, v5 11167; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11168; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 11169; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 11170; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 11171; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11172; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 11173; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11174; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc 11175; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 11176; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11177; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 11178; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11179; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11180; GFX8-NEXT: buffer_wbinvl1 11181; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11182; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11183; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11184; GFX8-NEXT: s_cbranch_execnz .LBB46_1 11185; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11186; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11187; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11188; GFX8-NEXT: s_setpc_b64 s[30:31] 11189; 11190; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: 11191; GFX7: ; %bb.0: 11192; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11193; GFX7-NEXT: v_mov_b32_e32 v3, v0 11194; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 11195; GFX7-NEXT: flat_load_dword v5, v[0:1] 11196; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 11197; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11198; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 11199; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11200; GFX7-NEXT: v_not_b32_e32 v4, v4 11201; GFX7-NEXT: s_mov_b64 s[4:5], 0 11202; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11203; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start 11204; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11205; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11206; GFX7-NEXT: v_mov_b32_e32 v6, v5 11207; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11208; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11209; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 11210; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11211; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 11212; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11213; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 11214; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11215; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11216; GFX7-NEXT: buffer_wbinvl1 11217; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11218; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11219; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11220; GFX7-NEXT: s_cbranch_execnz .LBB46_1 11221; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11222; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11223; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11224; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 11225; GFX7-NEXT: s_setpc_b64 s[30:31] 11226 %result = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 11227 ret bfloat %result 11228} 11229 11230define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 11231; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11232; GFX12: ; %bb.0: 11233; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11234; GFX12-NEXT: s_wait_expcnt 0x0 11235; GFX12-NEXT: s_wait_samplecnt 0x0 11236; GFX12-NEXT: s_wait_bvhcnt 0x0 11237; GFX12-NEXT: s_wait_kmcnt 0x0 11238; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11239; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11240; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11241; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11242; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 11243; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 11244; GFX12-NEXT: s_mov_b32 s0, 0 11245; GFX12-NEXT: flat_load_b32 v5, v[0:1] 11246; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11247; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11248; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11249; GFX12-NEXT: v_not_b32_e32 v4, v4 11250; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start 11251; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11252; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11253; GFX12-NEXT: v_mov_b32_e32 v6, v5 11254; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11255; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11256; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11257; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11258; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 11259; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 11260; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 11261; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11262; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11263; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11264; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11265; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11266; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11267; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11268; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11269; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 11270; GFX12-NEXT: s_wait_storecnt 0x0 11271; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 11272; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11273; GFX12-NEXT: global_inv scope:SCOPE_DEV 11274; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11275; GFX12-NEXT: s_wait_alu 0xfffe 11276; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11277; GFX12-NEXT: s_wait_alu 0xfffe 11278; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11279; GFX12-NEXT: s_cbranch_execnz .LBB47_1 11280; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11281; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11282; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11283; GFX12-NEXT: s_wait_alu 0xfffe 11284; GFX12-NEXT: s_setpc_b64 s[30:31] 11285; 11286; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11287; GFX940: ; %bb.0: 11288; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11289; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 11290; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11291; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11292; GFX940-NEXT: v_mov_b32_e32 v1, v5 11293; GFX940-NEXT: flat_load_dword v5, v[0:1] 11294; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 11295; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11296; 
GFX940-NEXT: s_mov_b32 s0, 0xffff 11297; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 11298; GFX940-NEXT: v_not_b32_e32 v4, v4 11299; GFX940-NEXT: s_mov_b64 s[0:1], 0 11300; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11301; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11302; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start 11303; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11304; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11305; GFX940-NEXT: v_mov_b32_e32 v7, v5 11306; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11307; GFX940-NEXT: s_nop 0 11308; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 11309; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 11310; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 11311; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 11312; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11313; GFX940-NEXT: s_nop 1 11314; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11315; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11316; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 11317; GFX940-NEXT: buffer_wbl2 sc1 11318; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 11319; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11320; GFX940-NEXT: buffer_inv sc1 11321; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11322; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11323; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11324; GFX940-NEXT: s_cbranch_execnz .LBB47_1 11325; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11326; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11327; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11328; GFX940-NEXT: s_setpc_b64 s[30:31] 11329; 11330; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11331; GFX11: ; %bb.0: 11332; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11333; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11334; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11335; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 
11336; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11337; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 11338; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 11339; GFX11-NEXT: s_mov_b32 s0, 0 11340; GFX11-NEXT: flat_load_b32 v5, v[0:1] 11341; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11342; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11344; GFX11-NEXT: v_not_b32_e32 v4, v4 11345; GFX11-NEXT: .p2align 6 11346; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start 11347; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11348; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11349; GFX11-NEXT: v_mov_b32_e32 v6, v5 11350; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11351; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11352; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11353; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11354; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 11355; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 11356; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 11357; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11358; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11359; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11360; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11361; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11362; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11363; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11364; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11365; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 11366; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11367; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 11368; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11369; GFX11-NEXT: buffer_gl1_inv 11370; GFX11-NEXT: buffer_gl0_inv 11371; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11372; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11373; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11374; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11375; GFX11-NEXT: s_cbranch_execnz .LBB47_1 11376; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11377; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11378; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11379; GFX11-NEXT: s_setpc_b64 s[30:31] 11380; 11381; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11382; GFX10: ; %bb.0: 11383; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11384; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11385; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11386; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11387; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 11388; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 11389; GFX10-NEXT: s_mov_b32 s4, 0 11390; GFX10-NEXT: flat_load_dword v5, v[0:1] 11391; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11392; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11393; GFX10-NEXT: v_not_b32_e32 v4, v4 11394; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start 11395; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11396; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11397; GFX10-NEXT: v_mov_b32_e32 v6, v5 11398; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11399; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 11400; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 11401; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 11402; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11403; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11404; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11405; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11406; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 11407; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11408; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11409; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11410; GFX10-NEXT: buffer_gl1_inv 11411; GFX10-NEXT: buffer_gl0_inv 11412; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11413; GFX10-NEXT: 
s_or_b32 s4, vcc_lo, s4 11414; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11415; GFX10-NEXT: s_cbranch_execnz .LBB47_1 11416; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11417; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11418; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11419; GFX10-NEXT: s_setpc_b64 s[30:31] 11420; 11421; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11422; GFX90A: ; %bb.0: 11423; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11424; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 11425; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11426; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 11427; GFX90A-NEXT: flat_load_dword v5, v[0:1] 11428; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 11429; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11430; GFX90A-NEXT: s_mov_b32 s4, 0xffff 11431; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11432; GFX90A-NEXT: v_not_b32_e32 v4, v4 11433; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11434; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11435; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11436; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start 11437; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11438; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11439; GFX90A-NEXT: v_mov_b32_e32 v7, v5 11440; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11441; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 11442; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 11443; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 11444; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 11445; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11446; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11447; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11448; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 11449; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 11450; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11451; GFX90A-NEXT: buffer_wbinvl1 11452; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v5, v7 11453; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11454; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11455; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 11456; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11457; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11458; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11459; GFX90A-NEXT: s_setpc_b64 s[30:31] 11460; 11461; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11462; GFX908: ; %bb.0: 11463; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11464; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 11465; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11466; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 11467; GFX908-NEXT: flat_load_dword v5, v[0:1] 11468; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 11469; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11470; GFX908-NEXT: s_mov_b32 s4, 0xffff 11471; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11472; GFX908-NEXT: v_not_b32_e32 v4, v4 11473; GFX908-NEXT: s_mov_b64 s[4:5], 0 11474; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11475; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11476; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start 11477; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11478; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11479; GFX908-NEXT: v_mov_b32_e32 v6, v5 11480; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11481; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 11482; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 11483; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 11484; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 11485; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11486; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 11487; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11488; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 11489; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11490; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11491; 
GFX908-NEXT: buffer_wbinvl1 11492; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11493; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11494; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11495; GFX908-NEXT: s_cbranch_execnz .LBB47_1 11496; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11497; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11498; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11499; GFX908-NEXT: s_setpc_b64 s[30:31] 11500; 11501; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11502; GFX8: ; %bb.0: 11503; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11504; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 11505; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11506; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 11507; GFX8-NEXT: flat_load_dword v5, v[0:1] 11508; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 11509; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11510; GFX8-NEXT: s_mov_b32 s4, 0xffff 11511; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11512; GFX8-NEXT: v_not_b32_e32 v4, v4 11513; GFX8-NEXT: s_mov_b64 s[4:5], 0 11514; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11515; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start 11516; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11517; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11518; GFX8-NEXT: v_mov_b32_e32 v6, v5 11519; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11520; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 11521; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 11522; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 11523; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11524; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 11525; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11526; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11527; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 11528; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11529; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 11530; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11531; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 11532; GFX8-NEXT: buffer_wbinvl1 11533; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11534; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11535; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11536; GFX8-NEXT: s_cbranch_execnz .LBB47_1 11537; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11538; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11539; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11540; GFX8-NEXT: s_setpc_b64 s[30:31] 11541; 11542; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11543; GFX7: ; %bb.0: 11544; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11545; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 11546; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11547; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 11548; GFX7-NEXT: flat_load_dword v5, v[0:1] 11549; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 11550; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11551; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 11552; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11553; GFX7-NEXT: v_not_b32_e32 v4, v4 11554; GFX7-NEXT: s_mov_b64 s[4:5], 0 11555; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11556; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start 11557; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11558; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11559; GFX7-NEXT: v_mov_b32_e32 v6, v5 11560; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11561; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11562; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 11563; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11564; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 11565; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11566; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 11567; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11568; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11569; GFX7-NEXT: buffer_wbinvl1 11570; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11571; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11572; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11573; GFX7-NEXT: s_cbranch_execnz .LBB47_1 11574; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11575; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11576; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11577; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 11578; GFX7-NEXT: s_setpc_b64 s[30:31] 11579 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11580 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 11581 ret bfloat %result 11582} 11583 11584define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 11585; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11586; GFX12: ; %bb.0: 11587; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11588; GFX12-NEXT: s_wait_expcnt 0x0 11589; GFX12-NEXT: s_wait_samplecnt 0x0 11590; GFX12-NEXT: s_wait_bvhcnt 0x0 11591; GFX12-NEXT: s_wait_kmcnt 0x0 11592; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 11593; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 11594; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11595; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11596; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 11597; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 11598; GFX12-NEXT: s_mov_b32 s0, 0 11599; GFX12-NEXT: flat_load_b32 v5, v[0:1] 11600; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11601; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11602; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11603; GFX12-NEXT: v_not_b32_e32 v4, v4 11604; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start 11605; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11606; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11607; GFX12-NEXT: v_mov_b32_e32 v6, v5 11608; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11609; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11610; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11611; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11612; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 11613; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 11614; 
GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 11615; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11616; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11617; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11618; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11619; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11620; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11621; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11622; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11623; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 11624; GFX12-NEXT: s_wait_storecnt 0x0 11625; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 11626; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11627; GFX12-NEXT: global_inv scope:SCOPE_DEV 11628; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11629; GFX12-NEXT: s_wait_alu 0xfffe 11630; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11631; GFX12-NEXT: s_wait_alu 0xfffe 11632; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11633; GFX12-NEXT: s_cbranch_execnz .LBB48_1 11634; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11635; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11636; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11637; GFX12-NEXT: s_wait_alu 0xfffe 11638; GFX12-NEXT: s_setpc_b64 s[30:31] 11639; 11640; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11641; GFX940: ; %bb.0: 11642; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11643; GFX940-NEXT: s_movk_i32 s0, 0xf800 11644; GFX940-NEXT: s_mov_b32 s1, -1 11645; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11646; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11647; GFX940-NEXT: v_mov_b32_e32 v1, v5 11648; GFX940-NEXT: flat_load_dword v5, v[0:1] 11649; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 11650; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11651; GFX940-NEXT: s_mov_b32 s0, 0xffff 11652; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 11653; GFX940-NEXT: v_not_b32_e32 v4, v4 11654; GFX940-NEXT: 
s_mov_b64 s[0:1], 0 11655; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11656; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11657; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start 11658; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11659; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11660; GFX940-NEXT: v_mov_b32_e32 v7, v5 11661; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11662; GFX940-NEXT: s_nop 0 11663; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 11664; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 11665; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 11666; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 11667; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11668; GFX940-NEXT: s_nop 1 11669; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11670; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11671; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 11672; GFX940-NEXT: buffer_wbl2 sc1 11673; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 11674; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11675; GFX940-NEXT: buffer_inv sc1 11676; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11677; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11678; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11679; GFX940-NEXT: s_cbranch_execnz .LBB48_1 11680; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11681; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11682; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11683; GFX940-NEXT: s_setpc_b64 s[30:31] 11684; 11685; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11686; GFX11: ; %bb.0: 11687; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11688; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 11689; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 11690; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11691; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11692; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 11693; 
GFX11-NEXT: v_and_b32_e32 v3, 3, v3 11694; GFX11-NEXT: s_mov_b32 s0, 0 11695; GFX11-NEXT: flat_load_b32 v5, v[0:1] 11696; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11697; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11698; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11699; GFX11-NEXT: v_not_b32_e32 v4, v4 11700; GFX11-NEXT: .p2align 6 11701; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start 11702; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11703; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11704; GFX11-NEXT: v_mov_b32_e32 v6, v5 11705; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11706; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11707; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11708; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11709; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 11710; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 11711; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 11712; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11713; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11714; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11715; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11717; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11718; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11720; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 11721; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11722; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 11723; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11724; GFX11-NEXT: buffer_gl1_inv 11725; GFX11-NEXT: buffer_gl0_inv 11726; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11727; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11728; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11729; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11730; GFX11-NEXT: s_cbranch_execnz .LBB48_1 11731; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11732; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11733; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11734; GFX11-NEXT: s_setpc_b64 s[30:31] 11735; 11736; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11737; GFX10: ; %bb.0: 11738; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11739; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 11740; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 11741; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11742; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 11743; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 11744; GFX10-NEXT: s_mov_b32 s4, 0 11745; GFX10-NEXT: flat_load_dword v5, v[0:1] 11746; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11747; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11748; GFX10-NEXT: v_not_b32_e32 v4, v4 11749; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start 11750; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11751; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11752; GFX10-NEXT: v_mov_b32_e32 v6, v5 11753; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11754; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 11755; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 11756; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 11757; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11758; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11759; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11760; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11761; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 11762; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11763; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11764; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11765; GFX10-NEXT: buffer_gl1_inv 11766; GFX10-NEXT: buffer_gl0_inv 11767; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11768; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11769; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11770; GFX10-NEXT: s_cbranch_execnz .LBB48_1 11771; GFX10-NEXT: ; 
%bb.2: ; %atomicrmw.end 11772; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11773; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11774; GFX10-NEXT: s_setpc_b64 s[30:31] 11775; 11776; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11777; GFX90A: ; %bb.0: 11778; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11779; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 11780; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 11781; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 11782; GFX90A-NEXT: flat_load_dword v5, v[0:1] 11783; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 11784; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11785; GFX90A-NEXT: s_mov_b32 s4, 0xffff 11786; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11787; GFX90A-NEXT: v_not_b32_e32 v4, v4 11788; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11789; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11790; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11791; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start 11792; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11793; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11794; GFX90A-NEXT: v_mov_b32_e32 v7, v5 11795; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11796; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 11797; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 11798; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 11799; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 11800; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11801; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11802; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11803; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 11804; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 11805; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11806; GFX90A-NEXT: buffer_wbinvl1 11807; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11808; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11809; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11810; GFX90A-NEXT: 
s_cbranch_execnz .LBB48_1 11811; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11812; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11813; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11814; GFX90A-NEXT: s_setpc_b64 s[30:31] 11815; 11816; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11817; GFX908: ; %bb.0: 11818; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11819; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 11820; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 11821; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 11822; GFX908-NEXT: flat_load_dword v5, v[0:1] 11823; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 11824; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11825; GFX908-NEXT: s_mov_b32 s4, 0xffff 11826; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11827; GFX908-NEXT: v_not_b32_e32 v4, v4 11828; GFX908-NEXT: s_mov_b64 s[4:5], 0 11829; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11830; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11831; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start 11832; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11833; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11834; GFX908-NEXT: v_mov_b32_e32 v6, v5 11835; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11836; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 11837; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 11838; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 11839; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 11840; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11841; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 11842; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11843; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 11844; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11845; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11846; GFX908-NEXT: buffer_wbinvl1 11847; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11848; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11849; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] 11850; GFX908-NEXT: s_cbranch_execnz .LBB48_1 11851; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11852; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11853; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11854; GFX908-NEXT: s_setpc_b64 s[30:31] 11855; 11856; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11857; GFX8: ; %bb.0: 11858; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11859; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 11860; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 11861; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 11862; GFX8-NEXT: flat_load_dword v5, v[0:1] 11863; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 11864; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11865; GFX8-NEXT: s_mov_b32 s4, 0xffff 11866; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11867; GFX8-NEXT: v_not_b32_e32 v4, v4 11868; GFX8-NEXT: s_mov_b64 s[4:5], 0 11869; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11870; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start 11871; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11872; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11873; GFX8-NEXT: v_mov_b32_e32 v6, v5 11874; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11875; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 11876; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 11877; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 11878; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11879; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 11880; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11881; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11882; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 11883; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11884; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 11885; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11886; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11887; GFX8-NEXT: buffer_wbinvl1 11888; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11889; GFX8-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] 11890; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11891; GFX8-NEXT: s_cbranch_execnz .LBB48_1 11892; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11893; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11894; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11895; GFX8-NEXT: s_setpc_b64 s[30:31] 11896; 11897; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 11898; GFX7: ; %bb.0: 11899; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11900; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 11901; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 11902; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 11903; GFX7-NEXT: flat_load_dword v5, v[0:1] 11904; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 11905; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11906; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 11907; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11908; GFX7-NEXT: v_not_b32_e32 v4, v4 11909; GFX7-NEXT: s_mov_b64 s[4:5], 0 11910; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11911; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start 11912; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11913; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11914; GFX7-NEXT: v_mov_b32_e32 v6, v5 11915; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11916; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11917; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 11918; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11919; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 11920; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11921; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 11922; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11923; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11924; GFX7-NEXT: buffer_wbinvl1 11925; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11926; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11927; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11928; GFX7-NEXT: s_cbranch_execnz .LBB48_1 11929; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11930; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11931; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11932; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 11933; 
GFX7-NEXT: s_setpc_b64 s[30:31] 11934 %gep = getelementptr bfloat, ptr %ptr, i64 -1024 11935 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 11936 ret bfloat %result 11937 } 11938 11939define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 11940; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11941; GFX12: ; %bb.0: 11942; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11943; GFX12-NEXT: s_wait_expcnt 0x0 11944; GFX12-NEXT: s_wait_samplecnt 0x0 11945; GFX12-NEXT: s_wait_bvhcnt 0x0 11946; GFX12-NEXT: s_wait_kmcnt 0x0 11947; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 11948; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11949; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11950; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11951; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 11952; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 11953; GFX12-NEXT: s_mov_b32 s0, 0 11954; GFX12-NEXT: flat_load_b32 v3, v[0:1] 11955; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11956; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 11957; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11958; GFX12-NEXT: v_not_b32_e32 v5, v5 11959; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start 11960; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11961; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11962; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 11963; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11964; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11965; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 11966; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 11967; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 11968; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 11969; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11970; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 11971; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11972; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 11973; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11974; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11975; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 11976; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 11977; GFX12-NEXT: s_wait_storecnt 0x0 11978; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 11979; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11980; GFX12-NEXT: global_inv scope:SCOPE_DEV 11981; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11982; GFX12-NEXT: v_mov_b32_e32 v3, v2 11983; GFX12-NEXT: s_wait_alu 0xfffe 11984; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11985; GFX12-NEXT: s_wait_alu 0xfffe 11986; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11987; GFX12-NEXT: s_cbranch_execnz .LBB49_1 11988; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11989; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11990; GFX12-NEXT: s_wait_alu 0xfffe 11991; GFX12-NEXT: s_setpc_b64 s[30:31] 11992; 11993; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11994; GFX940: ; %bb.0: 11995; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11996; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 11997; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11998; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11999; GFX940-NEXT: v_mov_b32_e32 v1, v5 12000; GFX940-NEXT: flat_load_dword v3, v[0:1] 12001; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 12002; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12003; GFX940-NEXT: s_mov_b32 s0, 0xffff 12004; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 12005; GFX940-NEXT: v_not_b32_e32 v5, v5 12006; GFX940-NEXT: s_mov_b64 s[0:1], 0 12007; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12008; GFX940-NEXT: s_movk_i32 s2, 0x7fff 12009; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start 12010; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12011; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
12012; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12013; GFX940-NEXT: s_nop 0 12014; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 12015; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 12016; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 12017; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 12018; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12019; GFX940-NEXT: s_nop 1 12020; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12021; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12022; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 12023; GFX940-NEXT: buffer_wbl2 sc1 12024; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 12025; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12026; GFX940-NEXT: buffer_inv sc1 12027; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12028; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12029; GFX940-NEXT: v_mov_b32_e32 v3, v2 12030; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12031; GFX940-NEXT: s_cbranch_execnz .LBB49_1 12032; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12033; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12034; GFX940-NEXT: s_setpc_b64 s[30:31] 12035; 12036; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12037; GFX11: ; %bb.0: 12038; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12039; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 12040; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 12041; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12042; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 12043; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 12044; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 12045; GFX11-NEXT: s_mov_b32 s0, 0 12046; GFX11-NEXT: flat_load_b32 v3, v[0:1] 12047; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12048; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12049; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12050; GFX11-NEXT: v_not_b32_e32 v5, v5 12051; 
GFX11-NEXT: .p2align 6 12052; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start 12053; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12054; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12055; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12056; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12057; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12058; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 12059; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 12060; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 12061; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 12062; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12063; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12064; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12065; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12066; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12067; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12068; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12069; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 12070; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12071; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 12072; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12073; GFX11-NEXT: buffer_gl1_inv 12074; GFX11-NEXT: buffer_gl0_inv 12075; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12076; GFX11-NEXT: v_mov_b32_e32 v3, v2 12077; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12078; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12079; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12080; GFX11-NEXT: s_cbranch_execnz .LBB49_1 12081; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12082; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12083; GFX11-NEXT: s_setpc_b64 s[30:31] 12084; 12085; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12086; GFX10: ; %bb.0: 12087; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12088; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 12089; GFX10-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v1, vcc_lo 12090; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12091; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 12092; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 12093; GFX10-NEXT: s_mov_b32 s4, 0 12094; GFX10-NEXT: flat_load_dword v3, v[0:1] 12095; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12096; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12097; GFX10-NEXT: v_not_b32_e32 v5, v5 12098; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start 12099; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12100; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12101; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12102; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 12103; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 12104; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 12105; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12106; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12107; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12108; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12109; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 12110; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12111; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12112; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12113; GFX10-NEXT: buffer_gl1_inv 12114; GFX10-NEXT: buffer_gl0_inv 12115; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12116; GFX10-NEXT: v_mov_b32_e32 v3, v2 12117; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12118; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12119; GFX10-NEXT: s_cbranch_execnz .LBB49_1 12120; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12121; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12122; GFX10-NEXT: s_setpc_b64 s[30:31] 12123; 12124; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12125; GFX90A: ; %bb.0: 12126; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12127; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 12128; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 
12129; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 12130; GFX90A-NEXT: flat_load_dword v3, v[0:1] 12131; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 12132; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12133; GFX90A-NEXT: s_mov_b32 s4, 0xffff 12134; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12135; GFX90A-NEXT: v_not_b32_e32 v5, v5 12136; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12137; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12138; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 12139; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start 12140; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12141; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12142; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12143; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 12144; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 12145; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 12146; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 12147; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12148; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12149; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12150; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 12151; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12152; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12153; GFX90A-NEXT: buffer_wbinvl1 12154; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12155; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12156; GFX90A-NEXT: v_mov_b32_e32 v3, v2 12157; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12158; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 12159; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12160; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12161; GFX90A-NEXT: s_setpc_b64 s[30:31] 12162; 12163; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12164; GFX908: ; %bb.0: 12165; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12166; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 12167; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 
12168; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 12169; GFX908-NEXT: flat_load_dword v3, v[0:1] 12170; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 12171; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12172; GFX908-NEXT: s_mov_b32 s4, 0xffff 12173; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12174; GFX908-NEXT: v_not_b32_e32 v5, v5 12175; GFX908-NEXT: s_mov_b64 s[4:5], 0 12176; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12177; GFX908-NEXT: s_movk_i32 s6, 0x7fff 12178; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start 12179; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12180; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12181; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12182; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 12183; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 12184; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 12185; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 12186; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12187; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12188; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12189; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 12190; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12191; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12192; GFX908-NEXT: buffer_wbinvl1 12193; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12194; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12195; GFX908-NEXT: v_mov_b32_e32 v3, v2 12196; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12197; GFX908-NEXT: s_cbranch_execnz .LBB49_1 12198; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12199; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12200; GFX908-NEXT: s_setpc_b64 s[30:31] 12201; 12202; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12203; GFX8: ; %bb.0: 12204; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12205; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 12206; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 12207; GFX8-NEXT: 
v_and_b32_e32 v0, -4, v4 12208; GFX8-NEXT: flat_load_dword v3, v[0:1] 12209; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 12210; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12211; GFX8-NEXT: s_mov_b32 s4, 0xffff 12212; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12213; GFX8-NEXT: v_not_b32_e32 v5, v5 12214; GFX8-NEXT: s_mov_b64 s[4:5], 0 12215; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12216; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start 12217; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12218; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12219; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12220; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 12221; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 12222; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 12223; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 12224; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 12225; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12226; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 12227; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 12228; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12229; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 12230; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12232; GFX8-NEXT: buffer_wbinvl1 12233; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12234; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12235; GFX8-NEXT: v_mov_b32_e32 v3, v2 12236; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12237; GFX8-NEXT: s_cbranch_execnz .LBB49_1 12238; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12239; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12240; GFX8-NEXT: s_setpc_b64 s[30:31] 12241; 12242; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12243; GFX7: ; %bb.0: 12244; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12245; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 12246; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 12247; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 
12248; GFX7-NEXT: flat_load_dword v3, v[0:1] 12249; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 12250; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12251; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 12252; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12253; GFX7-NEXT: v_not_b32_e32 v5, v5 12254; GFX7-NEXT: s_mov_b64 s[4:5], 0 12255; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 12256; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start 12257; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12258; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12259; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12260; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12261; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 12262; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12263; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 12264; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12265; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 12266; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12267; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12268; GFX7-NEXT: buffer_wbinvl1 12269; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12270; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12271; GFX7-NEXT: v_mov_b32_e32 v3, v2 12272; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12273; GFX7-NEXT: s_cbranch_execnz .LBB49_1 12274; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12275; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12276; GFX7-NEXT: s_setpc_b64 s[30:31] 12277 %gep = getelementptr bfloat, ptr %ptr, i64 1023 12278 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 12279 ret void 12280} 12281 12282define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 12283; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12284; GFX12: ; %bb.0: 12285; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12286; GFX12-NEXT: s_wait_expcnt 0x0 12287; GFX12-NEXT: s_wait_samplecnt 0x0 12288; GFX12-NEXT: s_wait_bvhcnt 0x0 12289; GFX12-NEXT: s_wait_kmcnt 0x0 12290; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 
0xfffff800, v0 12291; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 12292; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12293; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 12294; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 12295; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 12296; GFX12-NEXT: s_mov_b32 s0, 0 12297; GFX12-NEXT: flat_load_b32 v3, v[0:1] 12298; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12299; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12300; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 12301; GFX12-NEXT: v_not_b32_e32 v5, v5 12302; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start 12303; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12304; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12305; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12306; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12307; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12308; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 12309; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 12310; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 12311; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 12312; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12313; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12314; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12315; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12316; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12317; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12318; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12319; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 12320; GFX12-NEXT: s_wait_storecnt 0x0 12321; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12322; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12323; GFX12-NEXT: global_inv scope:SCOPE_DEV 12324; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12325; GFX12-NEXT: v_mov_b32_e32 v3, v2 12326; GFX12-NEXT: s_wait_alu 0xfffe 12327; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 12328; GFX12-NEXT: s_wait_alu 0xfffe 12329; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12330; GFX12-NEXT: s_cbranch_execnz .LBB50_1 12331; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12332; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12333; GFX12-NEXT: s_wait_alu 0xfffe 12334; GFX12-NEXT: s_setpc_b64 s[30:31] 12335; 12336; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12337; GFX940: ; %bb.0: 12338; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12339; GFX940-NEXT: s_movk_i32 s0, 0xf800 12340; GFX940-NEXT: s_mov_b32 s1, -1 12341; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 12342; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 12343; GFX940-NEXT: v_mov_b32_e32 v1, v5 12344; GFX940-NEXT: flat_load_dword v3, v[0:1] 12345; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 12346; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12347; GFX940-NEXT: s_mov_b32 s0, 0xffff 12348; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 12349; GFX940-NEXT: v_not_b32_e32 v5, v5 12350; GFX940-NEXT: s_mov_b64 s[0:1], 0 12351; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12352; GFX940-NEXT: s_movk_i32 s2, 0x7fff 12353; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start 12354; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12355; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12356; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12357; GFX940-NEXT: s_nop 0 12358; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 12359; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 12360; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 12361; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 12362; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12363; GFX940-NEXT: s_nop 1 12364; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12365; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12366; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 12367; GFX940-NEXT: buffer_wbl2 sc1 12368; GFX940-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] sc0 12369; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12370; GFX940-NEXT: buffer_inv sc1 12371; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12372; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12373; GFX940-NEXT: v_mov_b32_e32 v3, v2 12374; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12375; GFX940-NEXT: s_cbranch_execnz .LBB50_1 12376; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12377; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12378; GFX940-NEXT: s_setpc_b64 s[30:31] 12379; 12380; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12381; GFX11: ; %bb.0: 12382; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12383; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 12384; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 12385; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12386; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 12387; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 12388; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 12389; GFX11-NEXT: s_mov_b32 s0, 0 12390; GFX11-NEXT: flat_load_b32 v3, v[0:1] 12391; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12392; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12393; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12394; GFX11-NEXT: v_not_b32_e32 v5, v5 12395; GFX11-NEXT: .p2align 6 12396; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start 12397; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12398; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12399; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12400; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12401; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12402; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 12403; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 12404; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 12405; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 12406; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12407; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12408; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12409; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12410; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12411; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12412; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12413; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 12414; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12415; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 12416; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12417; GFX11-NEXT: buffer_gl1_inv 12418; GFX11-NEXT: buffer_gl0_inv 12419; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12420; GFX11-NEXT: v_mov_b32_e32 v3, v2 12421; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12422; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12423; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12424; GFX11-NEXT: s_cbranch_execnz .LBB50_1 12425; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12426; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12427; GFX11-NEXT: s_setpc_b64 s[30:31] 12428; 12429; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12430; GFX10: ; %bb.0: 12431; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12432; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 12433; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 12434; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12435; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 12436; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 12437; GFX10-NEXT: s_mov_b32 s4, 0 12438; GFX10-NEXT: flat_load_dword v3, v[0:1] 12439; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12440; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12441; GFX10-NEXT: v_not_b32_e32 v5, v5 12442; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start 12443; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12444; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12445; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12446; GFX10-NEXT: v_add_f32_e32 v2, v2, 
v6 12447; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 12448; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 12449; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12450; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12451; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12452; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12453; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 12454; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12455; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12456; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12457; GFX10-NEXT: buffer_gl1_inv 12458; GFX10-NEXT: buffer_gl0_inv 12459; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12460; GFX10-NEXT: v_mov_b32_e32 v3, v2 12461; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12462; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12463; GFX10-NEXT: s_cbranch_execnz .LBB50_1 12464; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12465; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12466; GFX10-NEXT: s_setpc_b64 s[30:31] 12467; 12468; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12469; GFX90A: ; %bb.0: 12470; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12471; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 12472; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 12473; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 12474; GFX90A-NEXT: flat_load_dword v3, v[0:1] 12475; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 12476; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12477; GFX90A-NEXT: s_mov_b32 s4, 0xffff 12478; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12479; GFX90A-NEXT: v_not_b32_e32 v5, v5 12480; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12481; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12482; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 12483; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start 12484; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12485; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12486; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12487; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 12488; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 12489; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 12490; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 12491; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12492; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12493; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12494; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 12495; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12496; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12497; GFX90A-NEXT: buffer_wbinvl1 12498; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12499; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12500; GFX90A-NEXT: v_mov_b32_e32 v3, v2 12501; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12502; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 12503; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12504; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12505; GFX90A-NEXT: s_setpc_b64 s[30:31] 12506; 12507; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12508; GFX908: ; %bb.0: 12509; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12510; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 12511; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 12512; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 12513; GFX908-NEXT: flat_load_dword v3, v[0:1] 12514; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 12515; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12516; GFX908-NEXT: s_mov_b32 s4, 0xffff 12517; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12518; GFX908-NEXT: v_not_b32_e32 v5, v5 12519; GFX908-NEXT: s_mov_b64 s[4:5], 0 12520; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12521; GFX908-NEXT: s_movk_i32 s6, 0x7fff 12522; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start 12523; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12524; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12525; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12526; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 12527; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 12528; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 12529; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 12530; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12531; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12532; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12533; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 12534; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12535; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12536; GFX908-NEXT: buffer_wbinvl1 12537; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12538; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12539; GFX908-NEXT: v_mov_b32_e32 v3, v2 12540; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12541; GFX908-NEXT: s_cbranch_execnz .LBB50_1 12542; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12543; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12544; GFX908-NEXT: s_setpc_b64 s[30:31] 12545; 12546; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12547; GFX8: ; %bb.0: 12548; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12549; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 12550; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 12551; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 12552; GFX8-NEXT: flat_load_dword v3, v[0:1] 12553; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 12554; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12555; GFX8-NEXT: s_mov_b32 s4, 0xffff 12556; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12557; GFX8-NEXT: v_not_b32_e32 v5, v5 12558; GFX8-NEXT: s_mov_b64 s[4:5], 0 12559; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12560; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start 12561; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12562; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12563; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12564; 
GFX8-NEXT: v_add_f32_e32 v2, v2, v6 12565; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 12566; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 12567; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 12568; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 12569; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12570; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 12571; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 12572; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12573; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 12574; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12575; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12576; GFX8-NEXT: buffer_wbinvl1 12577; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12578; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12579; GFX8-NEXT: v_mov_b32_e32 v3, v2 12580; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12581; GFX8-NEXT: s_cbranch_execnz .LBB50_1 12582; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12583; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12584; GFX8-NEXT: s_setpc_b64 s[30:31] 12585; 12586; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 12587; GFX7: ; %bb.0: 12588; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12589; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 12590; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 12591; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 12592; GFX7-NEXT: flat_load_dword v3, v[0:1] 12593; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 12594; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12595; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 12596; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12597; GFX7-NEXT: v_not_b32_e32 v5, v5 12598; GFX7-NEXT: s_mov_b64 s[4:5], 0 12599; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 12600; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start 12601; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12602; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12603; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12604; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12605; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 
12606; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12607; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 12608; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12609; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 12610; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12611; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12612; GFX7-NEXT: buffer_wbinvl1 12613; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12614; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12615; GFX7-NEXT: v_mov_b32_e32 v3, v2 12616; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12617; GFX7-NEXT: s_cbranch_execnz .LBB50_1 12618; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12619; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12620; GFX7-NEXT: s_setpc_b64 s[30:31] 12621 %gep = getelementptr bfloat, ptr %ptr, i64 -1024 12622 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 12623 ret void 12624} 12625 12626define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 12627; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12628; GFX12: ; %bb.0: 12629; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12630; GFX12-NEXT: s_wait_expcnt 0x0 12631; GFX12-NEXT: s_wait_samplecnt 0x0 12632; GFX12-NEXT: s_wait_bvhcnt 0x0 12633; GFX12-NEXT: s_wait_kmcnt 0x0 12634; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 12635; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12636; GFX12-NEXT: s_mov_b32 s0, 0 12637; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start 12638; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12639; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12640; GFX12-NEXT: v_mov_b32_e32 v4, v3 12641; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12642; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 12643; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 12644; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 12645; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 12646; GFX12-NEXT: 
v_or_b32_e32 v6, 0x400000, v3 12647; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 12648; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff 12649; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12650; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo 12651; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 12652; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 12653; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 12654; GFX12-NEXT: s_wait_storecnt 0x0 12655; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12656; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12657; GFX12-NEXT: global_inv scope:SCOPE_DEV 12658; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12659; GFX12-NEXT: s_wait_alu 0xfffe 12660; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12661; GFX12-NEXT: s_wait_alu 0xfffe 12662; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12663; GFX12-NEXT: s_cbranch_execnz .LBB51_1 12664; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12665; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12666; GFX12-NEXT: v_mov_b32_e32 v0, v3 12667; GFX12-NEXT: s_wait_alu 0xfffe 12668; GFX12-NEXT: s_setpc_b64 s[30:31] 12669; 12670; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12671; GFX940: ; %bb.0: 12672; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12673; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 12674; GFX940-NEXT: s_mov_b64 s[0:1], 0 12675; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12676; GFX940-NEXT: s_movk_i32 s2, 0x7fff 12677; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 12678; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start 12679; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12680; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12681; GFX940-NEXT: v_mov_b32_e32 v5, v3 12682; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 12683; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 12684; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 12685; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 12686; GFX940-NEXT: v_add3_u32 v4, 
v4, v3, s2 12687; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 12688; GFX940-NEXT: s_nop 1 12689; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc 12690; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 12691; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 12692; GFX940-NEXT: buffer_wbl2 sc1 12693; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 12694; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12695; GFX940-NEXT: buffer_inv sc1 12696; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12697; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12698; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12699; GFX940-NEXT: s_cbranch_execnz .LBB51_1 12700; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12701; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12702; GFX940-NEXT: v_mov_b32_e32 v0, v3 12703; GFX940-NEXT: s_setpc_b64 s[30:31] 12704; 12705; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12706; GFX11: ; %bb.0: 12707; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12708; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 12709; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12710; GFX11-NEXT: s_mov_b32 s0, 0 12711; GFX11-NEXT: .p2align 6 12712; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start 12713; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12714; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12715; GFX11-NEXT: v_mov_b32_e32 v4, v3 12716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12717; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 12718; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 12719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 12720; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 12721; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 12722; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 12723; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff 12724; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12725; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo 12726; GFX11-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 12727; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12728; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 12729; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12730; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 12731; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12732; GFX11-NEXT: buffer_gl1_inv 12733; GFX11-NEXT: buffer_gl0_inv 12734; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12735; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12736; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12737; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12738; GFX11-NEXT: s_cbranch_execnz .LBB51_1 12739; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12740; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12741; GFX11-NEXT: v_mov_b32_e32 v0, v3 12742; GFX11-NEXT: s_setpc_b64 s[30:31] 12743; 12744; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12745; GFX10: ; %bb.0: 12746; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12747; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 12748; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 12749; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 12750; GFX10-NEXT: s_mov_b32 s4, 0 12751; GFX10-NEXT: flat_load_dword v0, v[3:4] 12752; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start 12753; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12754; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12755; GFX10-NEXT: v_mov_b32_e32 v6, v0 12756; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 12757; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 12758; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 12759; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 12760; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12761; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 12762; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo 12763; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12764; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 12765; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12766; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12767; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 12768; GFX10-NEXT: buffer_gl1_inv 12769; GFX10-NEXT: buffer_gl0_inv 12770; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 12771; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12772; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12773; GFX10-NEXT: s_cbranch_execnz .LBB51_1 12774; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12775; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12776; GFX10-NEXT: s_setpc_b64 s[30:31] 12777; 12778; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12779; GFX90A: ; %bb.0: 12780; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12781; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 12782; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12783; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12784; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 12785; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 12786; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start 12787; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12788; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12789; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12790; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 12791; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 12792; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 12793; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 12794; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 12795; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 12796; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc 12797; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 12798; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 12799; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 12800; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12801; GFX90A-NEXT: buffer_wbinvl1 12802; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12803; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12804; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12805; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 12806; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12807; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12808; GFX90A-NEXT: v_mov_b32_e32 v0, v3 12809; GFX90A-NEXT: 
s_setpc_b64 s[30:31] 12810; 12811; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12812; GFX908: ; %bb.0: 12813; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12814; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 12815; GFX908-NEXT: s_mov_b64 s[4:5], 0 12816; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12817; GFX908-NEXT: s_movk_i32 s6, 0x7fff 12818; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 12819; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start 12820; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12821; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12822; GFX908-NEXT: v_mov_b32_e32 v4, v3 12823; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 12824; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 12825; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 12826; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 12827; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 12828; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 12829; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 12830; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 12831; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 12832; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 12833; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12834; GFX908-NEXT: buffer_wbinvl1 12835; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12836; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12837; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12838; GFX908-NEXT: s_cbranch_execnz .LBB51_1 12839; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12840; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12841; GFX908-NEXT: v_mov_b32_e32 v0, v3 12842; GFX908-NEXT: s_setpc_b64 s[30:31] 12843; 12844; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12845; GFX8: ; %bb.0: 12846; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12847; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 12848; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 12849; GFX8-NEXT: flat_load_dword v0, v[3:4] 12850; GFX8-NEXT: s_mov_b64 s[4:5], 0 
12851; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 12852; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start 12853; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12854; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12855; GFX8-NEXT: v_mov_b32_e32 v6, v0 12856; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 12857; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 12858; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 12859; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 12860; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 12861; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 12862; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12863; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 12864; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc 12865; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12866; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12867; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12868; GFX8-NEXT: buffer_wbinvl1 12869; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 12870; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12871; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12872; GFX8-NEXT: s_cbranch_execnz .LBB51_1 12873; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12874; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12875; GFX8-NEXT: s_setpc_b64 s[30:31] 12876; 12877; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 12878; GFX7: ; %bb.0: 12879; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12880; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 12881; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 12882; GFX7-NEXT: flat_load_dword v3, v[0:1] 12883; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12884; GFX7-NEXT: s_mov_b64 s[4:5], 0 12885; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12886; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start 12887; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12888; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12889; GFX7-NEXT: v_mov_b32_e32 v4, v3 12890; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 12891; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 12892; 
GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 12893; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 12894; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 12895; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12896; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12897; GFX7-NEXT: buffer_wbinvl1 12898; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12899; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12900; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12901; GFX7-NEXT: s_cbranch_execnz .LBB51_1 12902; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12903; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12904; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 12905; GFX7-NEXT: s_setpc_b64 s[30:31] 12906 %gep = getelementptr bfloat, ptr %ptr, i64 1023 12907 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 12908 ret bfloat %result 12909} 12910 12911define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 12912; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 12913; GFX12: ; %bb.0: 12914; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12915; GFX12-NEXT: s_wait_expcnt 0x0 12916; GFX12-NEXT: s_wait_samplecnt 0x0 12917; GFX12-NEXT: s_wait_bvhcnt 0x0 12918; GFX12-NEXT: s_wait_kmcnt 0x0 12919; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 12920; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 12921; GFX12-NEXT: s_mov_b32 s0, 0 12922; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start 12923; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12924; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12925; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 12926; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12927; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 12928; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 12929; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 12930; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12931; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) 12932; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 12933; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 12934; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12935; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12936; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 12937; GFX12-NEXT: s_wait_storecnt 0x0 12938; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12939; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12940; GFX12-NEXT: global_inv scope:SCOPE_DEV 12941; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12942; GFX12-NEXT: v_mov_b32_e32 v3, v2 12943; GFX12-NEXT: s_wait_alu 0xfffe 12944; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12945; GFX12-NEXT: s_wait_alu 0xfffe 12946; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12947; GFX12-NEXT: s_cbranch_execnz .LBB52_1 12948; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12949; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12950; GFX12-NEXT: s_wait_alu 0xfffe 12951; GFX12-NEXT: s_setpc_b64 s[30:31] 12952; 12953; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 12954; GFX940: ; %bb.0: 12955; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12956; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 12957; GFX940-NEXT: s_mov_b64 s[0:1], 0 12958; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 12959; GFX940-NEXT: s_movk_i32 s2, 0x7fff 12960; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 12961; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start 12962; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12963; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12964; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 12965; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 12966; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 12967; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 12968; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 12969; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12970; GFX940-NEXT: s_nop 1 12971; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 12972; 
GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12973; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 12974; GFX940-NEXT: buffer_wbl2 sc1 12975; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 12976; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12977; GFX940-NEXT: buffer_inv sc1 12978; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12979; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12980; GFX940-NEXT: v_mov_b32_e32 v3, v2 12981; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12982; GFX940-NEXT: s_cbranch_execnz .LBB52_1 12983; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12984; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12985; GFX940-NEXT: s_setpc_b64 s[30:31] 12986; 12987; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 12988; GFX11: ; %bb.0: 12989; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12990; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 12991; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 12992; GFX11-NEXT: s_mov_b32 s0, 0 12993; GFX11-NEXT: .p2align 6 12994; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start 12995; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12996; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12997; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 12998; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12999; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 13000; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 13001; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 13002; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 13003; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 13004; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 13005; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 13006; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13007; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 13008; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 13009; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13010; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
offset:2046 glc 13011; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13012; GFX11-NEXT: buffer_gl1_inv 13013; GFX11-NEXT: buffer_gl0_inv 13014; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13015; GFX11-NEXT: v_mov_b32_e32 v3, v2 13016; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13017; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13018; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13019; GFX11-NEXT: s_cbranch_execnz .LBB52_1 13020; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13021; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13022; GFX11-NEXT: s_setpc_b64 s[30:31] 13023; 13024; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 13025; GFX10: ; %bb.0: 13026; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13027; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 13028; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13029; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13030; GFX10-NEXT: s_mov_b32 s4, 0 13031; GFX10-NEXT: flat_load_dword v3, v[0:1] 13032; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start 13033; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13034; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13035; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 13036; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 13037; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 13038; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 13039; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 13040; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 13041; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 13042; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 13043; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 13044; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13045; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13046; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13047; GFX10-NEXT: buffer_gl1_inv 13048; GFX10-NEXT: buffer_gl0_inv 13049; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13050; GFX10-NEXT: v_mov_b32_e32 v3, v2 13051; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13052; GFX10-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s4 13053; GFX10-NEXT: s_cbranch_execnz .LBB52_1 13054; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13055; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13056; GFX10-NEXT: s_setpc_b64 s[30:31] 13057; 13058; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 13059; GFX90A: ; %bb.0: 13060; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13061; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 13062; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13063; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13064; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 13065; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 13066; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start 13067; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13068; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13069; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 13070; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 13071; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 13072; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 13073; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 13074; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 13075; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 13076; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 13077; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 13078; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 13079; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13080; GFX90A-NEXT: buffer_wbinvl1 13081; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13082; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13083; GFX90A-NEXT: v_mov_b32_e32 v3, v2 13084; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13085; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 13086; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13087; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13088; GFX90A-NEXT: s_setpc_b64 s[30:31] 13089; 13090; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 13091; GFX908: ; %bb.0: 13092; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13093; GFX908-NEXT: flat_load_dword v3, v[0:1] 
offset:2046 13094; GFX908-NEXT: s_mov_b64 s[4:5], 0 13095; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13096; GFX908-NEXT: s_movk_i32 s6, 0x7fff 13097; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 13098; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start 13099; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13100; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13101; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 13102; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 13103; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 13104; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 13105; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 13106; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 13107; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 13108; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 13109; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 13110; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 13111; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13112; GFX908-NEXT: buffer_wbinvl1 13113; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13114; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13115; GFX908-NEXT: v_mov_b32_e32 v3, v2 13116; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13117; GFX908-NEXT: s_cbranch_execnz .LBB52_1 13118; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13119; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13120; GFX908-NEXT: s_setpc_b64 s[30:31] 13121; 13122; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 13123; GFX8: ; %bb.0: 13124; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13125; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 13126; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13127; GFX8-NEXT: flat_load_dword v3, v[0:1] 13128; GFX8-NEXT: s_mov_b64 s[4:5], 0 13129; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13130; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start 13131; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13132; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13133; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 13134; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 13135; GFX8-NEXT: 
v_bfe_u32 v6, v2, 16, 1 13136; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 13137; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 13138; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 13139; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 13140; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 13141; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc 13142; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13143; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13144; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13145; GFX8-NEXT: buffer_wbinvl1 13146; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13147; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13148; GFX8-NEXT: v_mov_b32_e32 v3, v2 13149; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13150; GFX8-NEXT: s_cbranch_execnz .LBB52_1 13151; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13152; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13153; GFX8-NEXT: s_setpc_b64 s[30:31] 13154; 13155; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 13156; GFX7: ; %bb.0: 13157; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13158; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 13159; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13160; GFX7-NEXT: flat_load_dword v3, v[0:1] 13161; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13162; GFX7-NEXT: s_mov_b64 s[4:5], 0 13163; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 13164; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start 13165; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13166; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13167; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 13168; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 13169; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 13170; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 13171; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 13172; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13173; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13174; GFX7-NEXT: buffer_wbinvl1 13175; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13176; GFX7-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] 13177; GFX7-NEXT: v_mov_b32_e32 v3, v2 13178; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13179; GFX7-NEXT: s_cbranch_execnz .LBB52_1 13180; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13181; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13182; GFX7-NEXT: s_setpc_b64 s[30:31] 13183 %gep = getelementptr bfloat, ptr %ptr, i64 1023 13184 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 13185 ret void 13186} 13187 13188define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 13189; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13190; GFX12: ; %bb.0: 13191; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13192; GFX12-NEXT: s_wait_expcnt 0x0 13193; GFX12-NEXT: s_wait_samplecnt 0x0 13194; GFX12-NEXT: s_wait_bvhcnt 0x0 13195; GFX12-NEXT: s_wait_kmcnt 0x0 13196; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 13197; GFX12-NEXT: s_mov_b32 s0, 0 13198; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 13199; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 13200; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 13201; GFX12-NEXT: flat_load_b32 v4, v[0:1] 13202; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 13203; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 13204; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13205; GFX12-NEXT: v_not_b32_e32 v6, v3 13206; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start 13207; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13208; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13209; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 13210; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13211; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13212; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 13213; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 13214; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 13215; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 13216; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 13217; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 13218; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13219; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 13220; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 13221; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13222; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 13223; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 13224; GFX12-NEXT: s_wait_storecnt 0x0 13225; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 13226; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13227; GFX12-NEXT: global_inv scope:SCOPE_DEV 13228; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13229; GFX12-NEXT: v_mov_b32_e32 v4, v3 13230; GFX12-NEXT: s_wait_alu 0xfffe 13231; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13232; GFX12-NEXT: s_wait_alu 0xfffe 13233; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13234; GFX12-NEXT: s_cbranch_execnz .LBB53_1 13235; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13236; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13237; GFX12-NEXT: s_wait_alu 0xfffe 13238; GFX12-NEXT: s_setpc_b64 s[30:31] 13239; 13240; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13241; GFX940: ; %bb.0: 13242; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13243; GFX940-NEXT: v_mov_b32_e32 v3, v0 13244; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 13245; GFX940-NEXT: flat_load_dword v5, v[0:1] 13246; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 13247; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13248; GFX940-NEXT: s_mov_b32 s0, 0xffff 13249; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 13250; GFX940-NEXT: v_not_b32_e32 v6, v4 13251; GFX940-NEXT: s_mov_b64 s[0:1], 0 13252; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13253; GFX940-NEXT: s_movk_i32 s2, 0x7fff 13254; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start 13255; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13256; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
13257; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13258; GFX940-NEXT: s_nop 0 13259; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 13260; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 13261; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 13262; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 13263; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 13264; GFX940-NEXT: s_nop 1 13265; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc 13266; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13267; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 13268; GFX940-NEXT: buffer_wbl2 sc1 13269; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 13270; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13271; GFX940-NEXT: buffer_inv sc1 13272; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 13273; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13274; GFX940-NEXT: v_mov_b32_e32 v5, v4 13275; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13276; GFX940-NEXT: s_cbranch_execnz .LBB53_1 13277; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13278; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13279; GFX940-NEXT: s_setpc_b64 s[30:31] 13280; 13281; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13282; GFX11: ; %bb.0: 13283; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13284; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 13285; GFX11-NEXT: s_mov_b32 s0, 0 13286; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 13287; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 13288; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 13289; GFX11-NEXT: flat_load_b32 v4, v[0:1] 13290; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 13291; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 13292; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13293; GFX11-NEXT: v_not_b32_e32 v6, v3 13294; GFX11-NEXT: .p2align 6 13295; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start 13296; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 13297; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13298; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 13299; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13300; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13301; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 13302; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 13303; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 13304; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 13305; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 13306; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 13307; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13308; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 13309; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 13310; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13311; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 13312; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 13313; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13314; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 13315; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13316; GFX11-NEXT: buffer_gl1_inv 13317; GFX11-NEXT: buffer_gl0_inv 13318; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13319; GFX11-NEXT: v_mov_b32_e32 v4, v3 13320; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13321; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13322; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13323; GFX11-NEXT: s_cbranch_execnz .LBB53_1 13324; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13325; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13326; GFX11-NEXT: s_setpc_b64 s[30:31] 13327; 13328; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13329; GFX10: ; %bb.0: 13330; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13331; GFX10-NEXT: v_mov_b32_e32 v3, v0 13332; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13333; GFX10-NEXT: s_mov_b32 s4, 0 13334; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 13335; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 13336; 
GFX10-NEXT: flat_load_dword v4, v[0:1] 13337; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 13338; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 13339; GFX10-NEXT: v_not_b32_e32 v6, v3 13340; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start 13341; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13342; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13343; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13344; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 13345; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 13346; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 13347; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 13348; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 13349; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 13350; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13351; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 13352; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13353; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 13354; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13355; GFX10-NEXT: buffer_gl1_inv 13356; GFX10-NEXT: buffer_gl0_inv 13357; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13358; GFX10-NEXT: v_mov_b32_e32 v4, v3 13359; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13360; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13361; GFX10-NEXT: s_cbranch_execnz .LBB53_1 13362; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13363; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13364; GFX10-NEXT: s_setpc_b64 s[30:31] 13365; 13366; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13367; GFX90A: ; %bb.0: 13368; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13369; GFX90A-NEXT: v_mov_b32_e32 v3, v0 13370; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 13371; GFX90A-NEXT: flat_load_dword v5, v[0:1] 13372; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 13373; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13374; GFX90A-NEXT: s_mov_b32 s4, 0xffff 13375; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 13376; 
GFX90A-NEXT: v_not_b32_e32 v6, v4 13377; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13378; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13379; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 13380; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start 13381; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13382; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13383; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13384; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 13385; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 13386; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 13387; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 13388; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 13389; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc 13390; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13391; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 13392; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 13393; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13394; GFX90A-NEXT: buffer_wbinvl1 13395; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 13396; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13397; GFX90A-NEXT: v_mov_b32_e32 v5, v4 13398; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13399; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 13400; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13401; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13402; GFX90A-NEXT: s_setpc_b64 s[30:31] 13403; 13404; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13405; GFX908: ; %bb.0: 13406; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13407; GFX908-NEXT: v_mov_b32_e32 v3, v0 13408; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 13409; GFX908-NEXT: flat_load_dword v4, v[0:1] 13410; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 13411; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 13412; GFX908-NEXT: s_mov_b32 s4, 0xffff 13413; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 13414; GFX908-NEXT: v_not_b32_e32 v6, v3 13415; GFX908-NEXT: s_mov_b64 s[4:5], 0 13416; 
GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13417; GFX908-NEXT: s_movk_i32 s6, 0x7fff 13418; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start 13419; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13420; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13421; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13422; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 13423; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 13424; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 13425; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 13426; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13427; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 13428; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13429; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 13430; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 13431; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13432; GFX908-NEXT: buffer_wbinvl1 13433; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 13434; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13435; GFX908-NEXT: v_mov_b32_e32 v4, v3 13436; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13437; GFX908-NEXT: s_cbranch_execnz .LBB53_1 13438; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13439; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13440; GFX908-NEXT: s_setpc_b64 s[30:31] 13441; 13442; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13443; GFX8: ; %bb.0: 13444; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13445; GFX8-NEXT: v_mov_b32_e32 v3, v0 13446; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 13447; GFX8-NEXT: flat_load_dword v4, v[0:1] 13448; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 13449; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 13450; GFX8-NEXT: s_mov_b32 s4, 0xffff 13451; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 13452; GFX8-NEXT: v_not_b32_e32 v6, v3 13453; GFX8-NEXT: s_mov_b64 s[4:5], 0 13454; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13455; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start 13456; GFX8-NEXT: ; =>This 
Inner Loop Header: Depth=1 13457; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13458; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13459; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 13460; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 13461; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 13462; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 13463; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 13464; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13465; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc 13466; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 13467; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13468; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 13469; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 13470; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13471; GFX8-NEXT: buffer_wbinvl1 13472; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 13473; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13474; GFX8-NEXT: v_mov_b32_e32 v4, v3 13475; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13476; GFX8-NEXT: s_cbranch_execnz .LBB53_1 13477; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13478; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13479; GFX8-NEXT: s_setpc_b64 s[30:31] 13480; 13481; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: 13482; GFX7: ; %bb.0: 13483; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13484; GFX7-NEXT: v_mov_b32_e32 v3, v0 13485; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 13486; GFX7-NEXT: flat_load_dword v4, v[0:1] 13487; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 13488; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 13489; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 13490; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13491; GFX7-NEXT: v_not_b32_e32 v6, v3 13492; GFX7-NEXT: s_mov_b64 s[4:5], 0 13493; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13494; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start 13495; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13496; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13497; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 13498; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13499; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 13500; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 13501; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 13502; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 13503; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 13504; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 13505; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13506; GFX7-NEXT: buffer_wbinvl1 13507; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 13508; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13509; GFX7-NEXT: v_mov_b32_e32 v4, v3 13510; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13511; GFX7-NEXT: s_cbranch_execnz .LBB53_1 13512; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13513; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13514; GFX7-NEXT: s_setpc_b64 s[30:31] 13515 %unused = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 13516 ret void 13517} 13518 13519define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 13520; GFX12-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13521; GFX12: ; %bb.0: 13522; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13523; GFX12-NEXT: s_wait_expcnt 0x0 13524; GFX12-NEXT: s_wait_samplecnt 0x0 13525; GFX12-NEXT: s_wait_bvhcnt 0x0 13526; GFX12-NEXT: s_wait_kmcnt 0x0 13527; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 13528; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13529; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13530; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 13531; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 13532; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 13533; GFX12-NEXT: s_mov_b32 s0, 0 13534; GFX12-NEXT: flat_load_b32 v5, v[0:1] 13535; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13536; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 13537; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13538; GFX12-NEXT: v_not_b32_e32 v4, v4 
13539; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start 13540; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13541; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13542; GFX12-NEXT: v_mov_b32_e32 v6, v5 13543; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13544; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 13545; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13546; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13547; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 13548; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 13549; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 13550; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 13551; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 13552; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 13553; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 13554; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13555; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 13556; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 13557; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13558; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 13559; GFX12-NEXT: global_wb scope:SCOPE_SYS 13560; GFX12-NEXT: s_wait_storecnt 0x0 13561; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 13562; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13563; GFX12-NEXT: global_inv scope:SCOPE_SYS 13564; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 13565; GFX12-NEXT: s_wait_alu 0xfffe 13566; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13567; GFX12-NEXT: s_wait_alu 0xfffe 13568; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13569; GFX12-NEXT: s_cbranch_execnz .LBB54_1 13570; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13571; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13572; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13573; GFX12-NEXT: s_wait_alu 0xfffe 13574; GFX12-NEXT: s_setpc_b64 s[30:31] 13575; 13576; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 
13577; GFX940: ; %bb.0: 13578; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13579; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 13580; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 13581; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 13582; GFX940-NEXT: v_mov_b32_e32 v1, v5 13583; GFX940-NEXT: flat_load_dword v5, v[0:1] 13584; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 13585; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13586; GFX940-NEXT: s_mov_b32 s0, 0xffff 13587; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 13588; GFX940-NEXT: v_not_b32_e32 v4, v4 13589; GFX940-NEXT: s_mov_b64 s[0:1], 0 13590; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13591; GFX940-NEXT: s_movk_i32 s2, 0x7fff 13592; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start 13593; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13594; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13595; GFX940-NEXT: v_mov_b32_e32 v7, v5 13596; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13597; GFX940-NEXT: s_nop 0 13598; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 13599; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 13600; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 13601; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 13602; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13603; GFX940-NEXT: s_nop 1 13604; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 13605; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13606; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 13607; GFX940-NEXT: buffer_wbl2 sc0 sc1 13608; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 13609; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13610; GFX940-NEXT: buffer_inv sc0 sc1 13611; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 13612; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13613; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13614; GFX940-NEXT: s_cbranch_execnz .LBB54_1 13615; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13616; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13617; 
GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13618; GFX940-NEXT: s_setpc_b64 s[30:31] 13619; 13620; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13621; GFX11: ; %bb.0: 13622; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13623; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 13624; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13625; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13626; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 13627; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 13628; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 13629; GFX11-NEXT: s_mov_b32 s0, 0 13630; GFX11-NEXT: flat_load_b32 v5, v[0:1] 13631; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13632; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 13633; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13634; GFX11-NEXT: v_not_b32_e32 v4, v4 13635; GFX11-NEXT: .p2align 6 13636; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start 13637; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13638; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13639; GFX11-NEXT: v_mov_b32_e32 v6, v5 13640; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13641; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 13642; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13643; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13644; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 13645; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 13646; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 13647; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 13648; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 13649; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 13650; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 13651; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13652; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 13653; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 13654; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 13655; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 13656; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13657; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 13658; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13659; GFX11-NEXT: buffer_gl1_inv 13660; GFX11-NEXT: buffer_gl0_inv 13661; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 13662; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13663; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13664; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13665; GFX11-NEXT: s_cbranch_execnz .LBB54_1 13666; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13667; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13668; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13669; GFX11-NEXT: s_setpc_b64 s[30:31] 13670; 13671; GFX10-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13672; GFX10: ; %bb.0: 13673; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13674; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 13675; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13676; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13677; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 13678; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 13679; GFX10-NEXT: s_mov_b32 s4, 0 13680; GFX10-NEXT: flat_load_dword v5, v[0:1] 13681; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13682; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 13683; GFX10-NEXT: v_not_b32_e32 v4, v4 13684; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start 13685; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13686; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13687; GFX10-NEXT: v_mov_b32_e32 v6, v5 13688; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13689; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 13690; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 13691; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 13692; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 13693; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 13694; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 13695; 
GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13696; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 13697; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13698; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 13699; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13700; GFX10-NEXT: buffer_gl1_inv 13701; GFX10-NEXT: buffer_gl0_inv 13702; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 13703; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13704; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13705; GFX10-NEXT: s_cbranch_execnz .LBB54_1 13706; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13707; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13708; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13709; GFX10-NEXT: s_setpc_b64 s[30:31] 13710; 13711; GFX90A-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13712; GFX90A: ; %bb.0: 13713; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13714; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 13715; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 13716; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 13717; GFX90A-NEXT: flat_load_dword v5, v[0:1] 13718; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 13719; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13720; GFX90A-NEXT: s_mov_b32 s4, 0xffff 13721; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 13722; GFX90A-NEXT: v_not_b32_e32 v4, v4 13723; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13724; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13725; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 13726; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start 13727; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13728; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13729; GFX90A-NEXT: v_mov_b32_e32 v7, v5 13730; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13731; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 13732; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 13733; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 13734; GFX90A-NEXT: v_add3_u32 v6, v6, v5, 
s6 13735; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13736; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 13737; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13738; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 13739; GFX90A-NEXT: buffer_wbl2 13740; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 13741; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13742; GFX90A-NEXT: buffer_invl2 13743; GFX90A-NEXT: buffer_wbinvl1 13744; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 13745; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13746; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13747; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 13748; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13749; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13750; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13751; GFX90A-NEXT: s_setpc_b64 s[30:31] 13752; 13753; GFX908-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13754; GFX908: ; %bb.0: 13755; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13756; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 13757; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 13758; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 13759; GFX908-NEXT: flat_load_dword v5, v[0:1] 13760; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 13761; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13762; GFX908-NEXT: s_mov_b32 s4, 0xffff 13763; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 13764; GFX908-NEXT: v_not_b32_e32 v4, v4 13765; GFX908-NEXT: s_mov_b64 s[4:5], 0 13766; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13767; GFX908-NEXT: s_movk_i32 s6, 0x7fff 13768; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start 13769; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13770; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13771; GFX908-NEXT: v_mov_b32_e32 v6, v5 13772; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13773; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 13774; GFX908-NEXT: 
v_bfe_u32 v7, v5, 16, 1 13775; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 13776; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 13777; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13778; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 13779; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13780; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 13781; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 13782; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13783; GFX908-NEXT: buffer_wbinvl1 13784; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 13785; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13786; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13787; GFX908-NEXT: s_cbranch_execnz .LBB54_1 13788; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13789; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13790; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13791; GFX908-NEXT: s_setpc_b64 s[30:31] 13792; 13793; GFX8-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13794; GFX8: ; %bb.0: 13795; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13796; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 13797; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13798; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 13799; GFX8-NEXT: flat_load_dword v5, v[0:1] 13800; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 13801; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13802; GFX8-NEXT: s_mov_b32 s4, 0xffff 13803; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 13804; GFX8-NEXT: v_not_b32_e32 v4, v4 13805; GFX8-NEXT: s_mov_b64 s[4:5], 0 13806; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13807; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start 13808; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13809; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13810; GFX8-NEXT: v_mov_b32_e32 v6, v5 13811; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13812; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 13813; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 13814; GFX8-NEXT: 
v_add_u32_e32 v8, vcc, v8, v5 13815; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 13816; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 13817; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13818; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 13819; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 13820; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13821; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 13822; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 13823; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13824; GFX8-NEXT: buffer_wbinvl1 13825; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 13826; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13827; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13828; GFX8-NEXT: s_cbranch_execnz .LBB54_1 13829; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13830; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13831; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13832; GFX8-NEXT: s_setpc_b64 s[30:31] 13833; 13834; GFX7-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13835; GFX7: ; %bb.0: 13836; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13837; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 13838; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13839; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 13840; GFX7-NEXT: flat_load_dword v5, v[0:1] 13841; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 13842; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 13843; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 13844; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13845; GFX7-NEXT: v_not_b32_e32 v4, v4 13846; GFX7-NEXT: s_mov_b64 s[4:5], 0 13847; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13848; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start 13849; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13850; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13851; GFX7-NEXT: v_mov_b32_e32 v6, v5 13852; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 13853; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13854; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 13855; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 13856; 
GFX7-NEXT: v_and_b32_e32 v7, v6, v4 13857; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 13858; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 13859; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 13860; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13861; GFX7-NEXT: buffer_wbinvl1 13862; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 13863; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13864; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13865; GFX7-NEXT: s_cbranch_execnz .LBB54_1 13866; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13867; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13868; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 13869; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 13870; GFX7-NEXT: s_setpc_b64 s[30:31] 13871 %gep = getelementptr bfloat, ptr %ptr, i64 1023 13872 %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 13873 ret bfloat %result 13874} 13875 13876define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 13877; GFX12-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13878; GFX12: ; %bb.0: 13879; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13880; GFX12-NEXT: s_wait_expcnt 0x0 13881; GFX12-NEXT: s_wait_samplecnt 0x0 13882; GFX12-NEXT: s_wait_bvhcnt 0x0 13883; GFX12-NEXT: s_wait_kmcnt 0x0 13884; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 13885; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13886; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 13887; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 13888; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 13889; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 13890; GFX12-NEXT: s_mov_b32 s0, 0 13891; GFX12-NEXT: flat_load_b32 v3, v[0:1] 13892; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 13893; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 13894; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13895; GFX12-NEXT: v_not_b32_e32 v5, v5 13896; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start 13897; GFX12-NEXT: ; 
=>This Inner Loop Header: Depth=1 13898; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13899; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 13900; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13901; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13902; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 13903; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 13904; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 13905; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 13906; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 13907; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 13908; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13909; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 13910; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 13911; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13912; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 13913; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 13914; GFX12-NEXT: global_wb scope:SCOPE_SYS 13915; GFX12-NEXT: s_wait_storecnt 0x0 13916; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 13917; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13918; GFX12-NEXT: global_inv scope:SCOPE_SYS 13919; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13920; GFX12-NEXT: v_mov_b32_e32 v3, v2 13921; GFX12-NEXT: s_wait_alu 0xfffe 13922; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13923; GFX12-NEXT: s_wait_alu 0xfffe 13924; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13925; GFX12-NEXT: s_cbranch_execnz .LBB55_1 13926; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13927; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13928; GFX12-NEXT: s_wait_alu 0xfffe 13929; GFX12-NEXT: s_setpc_b64 s[30:31] 13930; 13931; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13932; GFX940: ; %bb.0: 13933; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13934; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 13935; GFX940-NEXT: v_lshl_add_u64 
v[4:5], v[0:1], 0, s[0:1] 13936; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 13937; GFX940-NEXT: v_mov_b32_e32 v1, v5 13938; GFX940-NEXT: flat_load_dword v3, v[0:1] 13939; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 13940; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 13941; GFX940-NEXT: s_mov_b32 s0, 0xffff 13942; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 13943; GFX940-NEXT: v_not_b32_e32 v5, v5 13944; GFX940-NEXT: s_mov_b64 s[0:1], 0 13945; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 13946; GFX940-NEXT: s_movk_i32 s2, 0x7fff 13947; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start 13948; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13949; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13950; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13951; GFX940-NEXT: s_nop 0 13952; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 13953; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 13954; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 13955; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 13956; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 13957; GFX940-NEXT: s_nop 1 13958; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 13959; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13960; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 13961; GFX940-NEXT: buffer_wbl2 sc0 sc1 13962; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 13963; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13964; GFX940-NEXT: buffer_inv sc0 sc1 13965; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13966; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13967; GFX940-NEXT: v_mov_b32_e32 v3, v2 13968; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13969; GFX940-NEXT: s_cbranch_execnz .LBB55_1 13970; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13971; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13972; GFX940-NEXT: s_setpc_b64 s[30:31] 13973; 13974; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 13975; GFX11: ; %bb.0: 13976; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13977; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 13978; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13979; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 13980; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 13981; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 13982; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 13983; GFX11-NEXT: s_mov_b32 s0, 0 13984; GFX11-NEXT: flat_load_b32 v3, v[0:1] 13985; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 13986; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 13987; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13988; GFX11-NEXT: v_not_b32_e32 v5, v5 13989; GFX11-NEXT: .p2align 6 13990; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start 13991; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13992; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13993; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 13994; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13995; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 13996; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 13997; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 13998; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 13999; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 14000; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 14001; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 14002; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14003; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 14004; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 14005; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14006; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 14007; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 14008; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14009; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 14010; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14011; GFX11-NEXT: buffer_gl1_inv 14012; GFX11-NEXT: buffer_gl0_inv 14013; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 14014; GFX11-NEXT: v_mov_b32_e32 v3, v2 14015; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 14016; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14017; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 14018; GFX11-NEXT: s_cbranch_execnz .LBB55_1 14019; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14020; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 14021; GFX11-NEXT: s_setpc_b64 s[30:31] 14022; 14023; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14024; GFX10: ; %bb.0: 14025; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14026; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 14027; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 14028; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 14029; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 14030; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 14031; GFX10-NEXT: s_mov_b32 s4, 0 14032; GFX10-NEXT: flat_load_dword v3, v[0:1] 14033; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 14034; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 14035; GFX10-NEXT: v_not_b32_e32 v5, v5 14036; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start 14037; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14038; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14039; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 14040; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 14041; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 14042; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 14043; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 14044; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 14045; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 14046; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14047; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 14048; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14049; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14050; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14051; GFX10-NEXT: buffer_gl1_inv 14052; GFX10-NEXT: 
buffer_gl0_inv 14053; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 14054; GFX10-NEXT: v_mov_b32_e32 v3, v2 14055; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 14056; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 14057; GFX10-NEXT: s_cbranch_execnz .LBB55_1 14058; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14059; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 14060; GFX10-NEXT: s_setpc_b64 s[30:31] 14061; 14062; GFX90A-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14063; GFX90A: ; %bb.0: 14064; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14065; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 14066; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 14067; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 14068; GFX90A-NEXT: flat_load_dword v3, v[0:1] 14069; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 14070; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 14071; GFX90A-NEXT: s_mov_b32 s4, 0xffff 14072; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 14073; GFX90A-NEXT: v_not_b32_e32 v5, v5 14074; GFX90A-NEXT: s_mov_b64 s[4:5], 0 14075; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 14076; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 14077; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start 14078; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14079; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14080; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 14081; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 14082; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 14083; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 14084; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 14085; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 14086; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 14087; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14088; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 14089; GFX90A-NEXT: buffer_wbl2 14090; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14091; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 14092; GFX90A-NEXT: buffer_invl2 14093; GFX90A-NEXT: buffer_wbinvl1 14094; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14095; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14096; GFX90A-NEXT: v_mov_b32_e32 v3, v2 14097; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 14098; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 14099; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14100; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 14101; GFX90A-NEXT: s_setpc_b64 s[30:31] 14102; 14103; GFX908-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14104; GFX908: ; %bb.0: 14105; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14106; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 14107; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 14108; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 14109; GFX908-NEXT: flat_load_dword v3, v[0:1] 14110; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 14111; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 14112; GFX908-NEXT: s_mov_b32 s4, 0xffff 14113; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 14114; GFX908-NEXT: v_not_b32_e32 v5, v5 14115; GFX908-NEXT: s_mov_b64 s[4:5], 0 14116; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 14117; GFX908-NEXT: s_movk_i32 s6, 0x7fff 14118; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start 14119; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14120; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14121; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 14122; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 14123; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 14124; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 14125; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 14126; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 14127; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 14128; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14129; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 14130; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14131; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14132; GFX908-NEXT: buffer_wbinvl1 14133; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14134; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14135; GFX908-NEXT: v_mov_b32_e32 v3, v2 14136; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 14137; GFX908-NEXT: s_cbranch_execnz .LBB55_1 14138; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14139; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 14140; GFX908-NEXT: s_setpc_b64 s[30:31] 14141; 14142; GFX8-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14143; GFX8: ; %bb.0: 14144; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14145; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 14146; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 14147; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 14148; GFX8-NEXT: flat_load_dword v3, v[0:1] 14149; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 14150; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 14151; GFX8-NEXT: s_mov_b32 s4, 0xffff 14152; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 14153; GFX8-NEXT: v_not_b32_e32 v5, v5 14154; GFX8-NEXT: s_mov_b64 s[4:5], 0 14155; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 14156; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start 14157; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14158; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14159; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 14160; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 14161; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 14162; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 14163; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 14164; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 14165; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 14166; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 14167; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 14168; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14169; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 14170; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14171; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14172; GFX8-NEXT: buffer_wbinvl1 14173; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14174; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14175; GFX8-NEXT: v_mov_b32_e32 v3, v2 14176; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 14177; GFX8-NEXT: s_cbranch_execnz .LBB55_1 14178; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14179; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 14180; GFX8-NEXT: s_setpc_b64 s[30:31] 14181; 14182; GFX7-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14183; GFX7: ; %bb.0: 14184; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14185; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 14186; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 14187; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 14188; GFX7-NEXT: flat_load_dword v3, v[0:1] 14189; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 14190; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 14191; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 14192; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 14193; GFX7-NEXT: v_not_b32_e32 v5, v5 14194; GFX7-NEXT: s_mov_b64 s[4:5], 0 14195; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 14196; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start 14197; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14198; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14199; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 14200; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 14201; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 14202; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 14203; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 14204; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 14205; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 14206; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14207; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14208; GFX7-NEXT: buffer_wbinvl1 14209; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14210; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14211; GFX7-NEXT: v_mov_b32_e32 v3, v2 14212; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14213; GFX7-NEXT: s_cbranch_execnz .LBB55_1 14214; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 
14215; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14216; GFX7-NEXT: s_setpc_b64 s[30:31] 14217 %gep = getelementptr bfloat, ptr %ptr, i64 1023 14218 %unused = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 14219 ret void 14220} 14221 14222; -------------------------------------------------------------------- 14223; <2 x half> 14224; -------------------------------------------------------------------- 14225 14226define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 14227; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14228; GFX12: ; %bb.0: 14229; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14230; GFX12-NEXT: s_wait_expcnt 0x0 14231; GFX12-NEXT: s_wait_samplecnt 0x0 14232; GFX12-NEXT: s_wait_bvhcnt 0x0 14233; GFX12-NEXT: s_wait_kmcnt 0x0 14234; GFX12-NEXT: s_wait_storecnt 0x0 14235; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14236; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14237; GFX12-NEXT: global_inv scope:SCOPE_DEV 14238; GFX12-NEXT: s_setpc_b64 s[30:31] 14239; 14240; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14241; GFX940: ; %bb.0: 14242; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14243; GFX940-NEXT: buffer_wbl2 sc1 14244; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 14245; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14246; GFX940-NEXT: buffer_inv sc1 14247; GFX940-NEXT: s_setpc_b64 s[30:31] 14248; 14249; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14250; GFX11: ; %bb.0: 14251; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14252; GFX11-NEXT: flat_load_b32 v3, v[0:1] 14253; GFX11-NEXT: s_mov_b32 s0, 0 14254; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start 14255; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14256; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14257; GFX11-NEXT: v_mov_b32_e32 v4, v3 14258; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 14259; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 14260; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14261; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 14262; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14263; GFX11-NEXT: buffer_gl1_inv 14264; GFX11-NEXT: buffer_gl0_inv 14265; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 14266; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 14267; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14268; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 14269; GFX11-NEXT: s_cbranch_execnz .LBB56_1 14270; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14271; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 14272; GFX11-NEXT: v_mov_b32_e32 v0, v3 14273; GFX11-NEXT: s_setpc_b64 s[30:31] 14274; 14275; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14276; GFX10: ; %bb.0: 14277; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14278; GFX10-NEXT: flat_load_dword v3, v[0:1] 14279; GFX10-NEXT: s_mov_b32 s4, 0 14280; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start 14281; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14282; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14283; GFX10-NEXT: v_mov_b32_e32 v4, v3 14284; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 14285; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14286; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 14287; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14288; GFX10-NEXT: buffer_gl1_inv 14289; GFX10-NEXT: buffer_gl0_inv 14290; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 14291; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 14292; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 14293; GFX10-NEXT: s_cbranch_execnz .LBB56_1 14294; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14295; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 14296; GFX10-NEXT: v_mov_b32_e32 v0, v3 14297; GFX10-NEXT: s_setpc_b64 s[30:31] 14298; 14299; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14300; GFX90A: ; %bb.0: 14301; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14302; GFX90A-NEXT: 
flat_load_dword v3, v[0:1] 14303; GFX90A-NEXT: s_mov_b64 s[4:5], 0 14304; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start 14305; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14306; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14307; GFX90A-NEXT: v_mov_b32_e32 v5, v3 14308; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 14309; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 14310; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14311; GFX90A-NEXT: buffer_wbinvl1 14312; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 14313; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14314; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 14315; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 14316; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14317; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 14318; GFX90A-NEXT: v_mov_b32_e32 v0, v3 14319; GFX90A-NEXT: s_setpc_b64 s[30:31] 14320; 14321; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14322; GFX908: ; %bb.0: 14323; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14324; GFX908-NEXT: flat_load_dword v3, v[0:1] 14325; GFX908-NEXT: s_mov_b64 s[4:5], 0 14326; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start 14327; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14328; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14329; GFX908-NEXT: v_mov_b32_e32 v4, v3 14330; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 14331; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 14332; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14333; GFX908-NEXT: buffer_wbinvl1 14334; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 14335; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14336; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 14337; GFX908-NEXT: s_cbranch_execnz .LBB56_1 14338; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14339; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 14340; GFX908-NEXT: v_mov_b32_e32 v0, v3 14341; GFX908-NEXT: s_setpc_b64 s[30:31] 14342; 14343; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14344; GFX8: ; %bb.0: 14345; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 14346; GFX8-NEXT: flat_load_dword v3, v[0:1] 14347; GFX8-NEXT: s_mov_b64 s[4:5], 0 14348; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start 14349; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14350; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14351; GFX8-NEXT: v_mov_b32_e32 v4, v3 14352; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 14353; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14354; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 14355; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 14356; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 14357; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 14358; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14359; GFX8-NEXT: buffer_wbinvl1 14360; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 14361; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14362; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 14363; GFX8-NEXT: s_cbranch_execnz .LBB56_1 14364; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14365; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 14366; GFX8-NEXT: v_mov_b32_e32 v0, v3 14367; GFX8-NEXT: s_setpc_b64 s[30:31] 14368; 14369; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: 14370; GFX7: ; %bb.0: 14371; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14372; GFX7-NEXT: flat_load_dword v5, v[0:1] 14373; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 14374; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 14375; GFX7-NEXT: s_mov_b64 s[4:5], 0 14376; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 14377; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14378; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 14379; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 14380; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 14381; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 14382; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start 14383; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14384; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 14385; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 14386; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 14387; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 14388; GFX7-NEXT: v_lshlrev_b32_e32 v3, 
16, v3 14389; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 14390; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 14391; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 14392; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 14393; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 14394; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 14395; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 14396; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc 14397; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14398; GFX7-NEXT: buffer_wbinvl1 14399; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 14400; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 14401; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 14402; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 14403; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14404; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14405; GFX7-NEXT: s_cbranch_execnz .LBB56_1 14406; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14407; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14408; GFX7-NEXT: v_mov_b32_e32 v0, v2 14409; GFX7-NEXT: v_mov_b32_e32 v1, v3 14410; GFX7-NEXT: s_setpc_b64 s[30:31] 14411 %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 14412 ret <2 x half> %result 14413} 14414 14415define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 14416; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14417; GFX12: ; %bb.0: 14418; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14419; GFX12-NEXT: s_wait_expcnt 0x0 14420; GFX12-NEXT: s_wait_samplecnt 0x0 14421; GFX12-NEXT: s_wait_bvhcnt 0x0 14422; GFX12-NEXT: s_wait_kmcnt 0x0 14423; GFX12-NEXT: s_wait_storecnt 0x0 14424; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14425; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14426; GFX12-NEXT: global_inv scope:SCOPE_DEV 14427; GFX12-NEXT: s_setpc_b64 s[30:31] 14428; 14429; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14430; GFX940: ; %bb.0: 
14431; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14432; GFX940-NEXT: buffer_wbl2 sc1 14433; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 14434; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14435; GFX940-NEXT: buffer_inv sc1 14436; GFX940-NEXT: s_setpc_b64 s[30:31] 14437; 14438; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14439; GFX11: ; %bb.0: 14440; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14441; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 14442; GFX11-NEXT: s_mov_b32 s0, 0 14443; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start 14444; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14445; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14446; GFX11-NEXT: v_mov_b32_e32 v4, v3 14447; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14448; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 14449; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14450; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 14451; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14452; GFX11-NEXT: buffer_gl1_inv 14453; GFX11-NEXT: buffer_gl0_inv 14454; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 14455; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 14456; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14457; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 14458; GFX11-NEXT: s_cbranch_execnz .LBB57_1 14459; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14460; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 14461; GFX11-NEXT: v_mov_b32_e32 v0, v3 14462; GFX11-NEXT: s_setpc_b64 s[30:31] 14463; 14464; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14465; GFX10: ; %bb.0: 14466; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14467; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 14468; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 14469; GFX10-NEXT: s_mov_b32 s4, 0 14470; GFX10-NEXT: flat_load_dword v0, v[3:4] 14471; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start 14472; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 14473; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14474; GFX10-NEXT: v_mov_b32_e32 v1, v0 14475; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 14476; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14477; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 14478; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14479; GFX10-NEXT: buffer_gl1_inv 14480; GFX10-NEXT: buffer_gl0_inv 14481; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 14482; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 14483; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 14484; GFX10-NEXT: s_cbranch_execnz .LBB57_1 14485; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14486; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 14487; GFX10-NEXT: s_setpc_b64 s[30:31] 14488; 14489; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14490; GFX90A: ; %bb.0: 14491; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14492; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 14493; GFX90A-NEXT: s_mov_b64 s[4:5], 0 14494; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start 14495; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14496; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14497; GFX90A-NEXT: v_mov_b32_e32 v5, v3 14498; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 14499; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 14500; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14501; GFX90A-NEXT: buffer_wbinvl1 14502; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 14503; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14504; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 14505; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 14506; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14507; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 14508; GFX90A-NEXT: v_mov_b32_e32 v0, v3 14509; GFX90A-NEXT: s_setpc_b64 s[30:31] 14510; 14511; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14512; GFX908: ; %bb.0: 14513; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14514; GFX908-NEXT: flat_load_dword v3, v[0:1] 
offset:2044 14515; GFX908-NEXT: s_mov_b64 s[4:5], 0 14516; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start 14517; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14518; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14519; GFX908-NEXT: v_mov_b32_e32 v4, v3 14520; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 14521; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 14522; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14523; GFX908-NEXT: buffer_wbinvl1 14524; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 14525; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14526; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 14527; GFX908-NEXT: s_cbranch_execnz .LBB57_1 14528; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14529; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 14530; GFX908-NEXT: v_mov_b32_e32 v0, v3 14531; GFX908-NEXT: s_setpc_b64 s[30:31] 14532; 14533; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14534; GFX8: ; %bb.0: 14535; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14536; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 14537; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 14538; GFX8-NEXT: flat_load_dword v0, v[3:4] 14539; GFX8-NEXT: s_mov_b64 s[4:5], 0 14540; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start 14541; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14542; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14543; GFX8-NEXT: v_mov_b32_e32 v1, v0 14544; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 14545; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14546; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 14547; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14548; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 14549; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 14550; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14551; GFX8-NEXT: buffer_wbinvl1 14552; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14553; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14554; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 14555; GFX8-NEXT: s_cbranch_execnz 
.LBB57_1 14556; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14557; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 14558; GFX8-NEXT: s_setpc_b64 s[30:31] 14559; 14560; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14561; GFX7: ; %bb.0: 14562; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14563; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 14564; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 14565; GFX7-NEXT: flat_load_dword v1, v[4:5] 14566; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 14567; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 14568; GFX7-NEXT: s_mov_b64 s[4:5], 0 14569; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 14570; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 14571; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14572; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 14573; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14574; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 14575; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start 14576; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14577; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 14578; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 14579; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 14580; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 14581; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 14582; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 14583; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 14584; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 14585; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 14586; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 14587; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14588; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 14589; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 14590; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14591; GFX7-NEXT: buffer_wbinvl1 14592; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 14593; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 14594; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 14595; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 14596; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14597; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14598; GFX7-NEXT: s_cbranch_execnz .LBB57_1 14599; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14600; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14601; GFX7-NEXT: s_setpc_b64 s[30:31] 14602 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 14603 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 14604 ret <2 x half> %result 14605} 14606 14607define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 14608; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14609; GFX12: ; %bb.0: 14610; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14611; GFX12-NEXT: s_wait_expcnt 0x0 14612; GFX12-NEXT: s_wait_samplecnt 0x0 14613; GFX12-NEXT: s_wait_bvhcnt 0x0 14614; GFX12-NEXT: s_wait_kmcnt 0x0 14615; GFX12-NEXT: s_wait_storecnt 0x0 14616; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14617; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14618; GFX12-NEXT: global_inv scope:SCOPE_DEV 14619; GFX12-NEXT: s_setpc_b64 s[30:31] 14620; 14621; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14622; GFX940: ; %bb.0: 14623; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14624; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 14625; GFX940-NEXT: s_nop 1 14626; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 14627; GFX940-NEXT: buffer_wbl2 sc1 14628; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 14629; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14630; GFX940-NEXT: buffer_inv sc1 14631; GFX940-NEXT: s_setpc_b64 s[30:31] 14632; 14633; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14634; GFX11: ; %bb.0: 14635; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14636; GFX11-NEXT: v_mov_b32_e32 v3, v0 14637; GFX11-NEXT: s_mov_b32 s0, 0 14638; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14639; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 14640; GFX11-NEXT: v_add_co_ci_u32_e32 
v5, vcc_lo, -1, v1, vcc_lo 14641; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 14642; GFX11-NEXT: flat_load_b32 v0, v[4:5] 14643; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 14644; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start 14645; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14646; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14647; GFX11-NEXT: v_mov_b32_e32 v1, v0 14648; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14649; GFX11-NEXT: v_pk_add_f16 v0, v1, v2 14650; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14651; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc 14652; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14653; GFX11-NEXT: buffer_gl1_inv 14654; GFX11-NEXT: buffer_gl0_inv 14655; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 14656; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 14657; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14658; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 14659; GFX11-NEXT: s_cbranch_execnz .LBB58_1 14660; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14661; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 14662; GFX11-NEXT: s_setpc_b64 s[30:31] 14663; 14664; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14665; GFX10: ; %bb.0: 14666; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14667; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 14668; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 14669; GFX10-NEXT: s_mov_b32 s4, 0 14670; GFX10-NEXT: flat_load_dword v0, v[3:4] 14671; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start 14672; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14673; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14674; GFX10-NEXT: v_mov_b32_e32 v1, v0 14675; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 14676; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14677; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 14678; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14679; GFX10-NEXT: buffer_gl1_inv 14680; GFX10-NEXT: buffer_gl0_inv 14681; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 
14682; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 14683; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 14684; GFX10-NEXT: s_cbranch_execnz .LBB58_1 14685; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14686; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 14687; GFX10-NEXT: s_setpc_b64 s[30:31] 14688; 14689; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14690; GFX90A: ; %bb.0: 14691; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14692; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 14693; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 14694; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 14695; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 14696; GFX90A-NEXT: flat_load_dword v0, v[0:1] 14697; GFX90A-NEXT: s_mov_b64 s[4:5], 0 14698; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start 14699; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14700; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14701; GFX90A-NEXT: v_mov_b32_e32 v1, v0 14702; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 14703; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 14704; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14705; GFX90A-NEXT: buffer_wbinvl1 14706; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14707; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14708; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 14709; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 14710; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14711; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 14712; GFX90A-NEXT: s_setpc_b64 s[30:31] 14713; 14714; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14715; GFX908: ; %bb.0: 14716; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14717; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 14718; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 14719; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 14720; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 14721; GFX908-NEXT: flat_load_dword v0, v[0:1] 14722; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 14723; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start 14724; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14725; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14726; GFX908-NEXT: v_mov_b32_e32 v1, v0 14727; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 14728; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 14729; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14730; GFX908-NEXT: buffer_wbinvl1 14731; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14732; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14733; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 14734; GFX908-NEXT: s_cbranch_execnz .LBB58_1 14735; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14736; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 14737; GFX908-NEXT: s_setpc_b64 s[30:31] 14738; 14739; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14740; GFX8: ; %bb.0: 14741; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14742; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 14743; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 14744; GFX8-NEXT: flat_load_dword v0, v[3:4] 14745; GFX8-NEXT: s_mov_b64 s[4:5], 0 14746; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start 14747; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14748; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14749; GFX8-NEXT: v_mov_b32_e32 v1, v0 14750; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 14751; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14752; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 14753; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14754; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 14755; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 14756; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14757; GFX8-NEXT: buffer_wbinvl1 14758; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14759; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14760; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 14761; GFX8-NEXT: s_cbranch_execnz .LBB58_1 14762; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14763; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] 14764; GFX8-NEXT: s_setpc_b64 s[30:31] 14765; 14766; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 14767; GFX7: ; %bb.0: 14768; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14769; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 14770; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 14771; GFX7-NEXT: flat_load_dword v1, v[4:5] 14772; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 14773; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 14774; GFX7-NEXT: s_mov_b64 s[4:5], 0 14775; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 14776; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 14777; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14778; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 14779; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14780; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 14781; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start 14782; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14783; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 14784; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 14785; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 14786; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 14787; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 14788; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 14789; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 14790; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 14791; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 14792; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 14793; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14794; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 14795; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 14796; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14797; GFX7-NEXT: buffer_wbinvl1 14798; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 14799; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 14800; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 14801; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 14802; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14803; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14804; GFX7-NEXT: s_cbranch_execnz .LBB58_1 14805; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14806; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14807; GFX7-NEXT: 
s_setpc_b64 s[30:31] 14808 %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 14809 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 14810 ret <2 x half> %result 14811} 14812 14813define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 14814; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14815; GFX12: ; %bb.0: 14816; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14817; GFX12-NEXT: s_wait_expcnt 0x0 14818; GFX12-NEXT: s_wait_samplecnt 0x0 14819; GFX12-NEXT: s_wait_bvhcnt 0x0 14820; GFX12-NEXT: s_wait_kmcnt 0x0 14821; GFX12-NEXT: s_wait_storecnt 0x0 14822; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV 14823; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 14824; GFX12-NEXT: global_inv scope:SCOPE_DEV 14825; GFX12-NEXT: s_setpc_b64 s[30:31] 14826; 14827; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14828; GFX940: ; %bb.0: 14829; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14830; GFX940-NEXT: buffer_wbl2 sc1 14831; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 14832; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14833; GFX940-NEXT: buffer_inv sc1 14834; GFX940-NEXT: s_setpc_b64 s[30:31] 14835; 14836; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14837; GFX11: ; %bb.0: 14838; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14839; GFX11-NEXT: flat_load_b32 v4, v[0:1] 14840; GFX11-NEXT: s_mov_b32 s0, 0 14841; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start 14842; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14843; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14844; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 14845; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14846; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 14847; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14848; GFX11-NEXT: buffer_gl1_inv 14849; GFX11-NEXT: buffer_gl0_inv 14850; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 14851; GFX11-NEXT: v_mov_b32_e32 v4, v3 14852; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 14853; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14854; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 14855; GFX11-NEXT: s_cbranch_execnz .LBB59_1 14856; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14857; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 14858; GFX11-NEXT: s_setpc_b64 s[30:31] 14859; 14860; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14861; GFX10: ; %bb.0: 14862; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14863; GFX10-NEXT: flat_load_dword v4, v[0:1] 14864; GFX10-NEXT: s_mov_b32 s4, 0 14865; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start 14866; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14867; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14868; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 14869; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14870; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 14871; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14872; GFX10-NEXT: buffer_gl1_inv 14873; GFX10-NEXT: buffer_gl0_inv 14874; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 14875; GFX10-NEXT: v_mov_b32_e32 v4, v3 14876; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 14877; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 14878; GFX10-NEXT: s_cbranch_execnz .LBB59_1 14879; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14880; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 14881; GFX10-NEXT: s_setpc_b64 s[30:31] 14882; 14883; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14884; GFX90A: ; %bb.0: 14885; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14886; GFX90A-NEXT: flat_load_dword v5, v[0:1] 14887; GFX90A-NEXT: s_mov_b64 s[4:5], 0 14888; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start 14889; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14890; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14891; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 14892; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 14893; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 14894; GFX90A-NEXT: buffer_wbinvl1 14895; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 14896; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14897; GFX90A-NEXT: v_mov_b32_e32 v5, v3 14898; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 14899; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 14900; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14901; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 14902; GFX90A-NEXT: s_setpc_b64 s[30:31] 14903; 14904; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14905; GFX908: ; %bb.0: 14906; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14907; GFX908-NEXT: flat_load_dword v4, v[0:1] 14908; GFX908-NEXT: s_mov_b64 s[4:5], 0 14909; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start 14910; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14911; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14912; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 14913; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 14914; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14915; GFX908-NEXT: buffer_wbinvl1 14916; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 14917; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14918; GFX908-NEXT: v_mov_b32_e32 v4, v3 14919; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 14920; GFX908-NEXT: s_cbranch_execnz .LBB59_1 14921; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14922; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 14923; GFX908-NEXT: s_setpc_b64 s[30:31] 14924; 14925; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14926; GFX8: ; %bb.0: 14927; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14928; GFX8-NEXT: flat_load_dword v4, v[0:1] 14929; GFX8-NEXT: s_mov_b64 s[4:5], 0 14930; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start 14931; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14932; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14933; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 14934; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 14935; 
GFX8-NEXT: v_add_f16_e32 v5, v4, v2 14936; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 14937; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 14938; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 14939; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14940; GFX8-NEXT: buffer_wbinvl1 14941; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 14942; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14943; GFX8-NEXT: v_mov_b32_e32 v4, v3 14944; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 14945; GFX8-NEXT: s_cbranch_execnz .LBB59_1 14946; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14947; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 14948; GFX8-NEXT: s_setpc_b64 s[30:31] 14949; 14950; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: 14951; GFX7: ; %bb.0: 14952; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14953; GFX7-NEXT: flat_load_dword v5, v[0:1] 14954; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 14955; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 14956; GFX7-NEXT: s_mov_b64 s[4:5], 0 14957; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 14958; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14959; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 14960; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 14961; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 14962; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 14963; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start 14964; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14965; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 14966; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 14967; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 14968; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 14969; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 14970; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 14971; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 14972; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 14973; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 14974; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 14975; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 14976; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 14977; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 14978; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14979; GFX7-NEXT: buffer_wbinvl1 
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; Agent-scope seq_cst fadd of <2 x half> whose result is unused. The address is
; %ptr advanced by 511 <2 x half> elements, which the expected output shows as
; a byte offset of 2044 (0x7fc).
; In the checks below, the gfx1200 and gfx940 runs select flat_atomic_pk_add_f16
; directly; every other run expands the operation into a flat_atomic_cmpswap
; retry loop. The gfx1100, gfx90a and gfx908 runs fold the offset into the memory
; instructions, while the gfx1010, tonga and hawaii runs add 0x7fc to the pointer
; first.
define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB60_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB60_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB60_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB60_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
  %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; Agent-scope seq_cst fadd of <2 x half> whose result is unused, with the
; address moved back by 512 <2 x half> elements (-2048 bytes).
; In the checks below, only the gfx1200 run folds this into an instruction
; offset (offset:-2048). Every other run first forms the address with a 64-bit
; add of 0xfffff800 / -1, then issues either flat_atomic_pk_add_f16 (gfx940) or
; a flat_atomic_cmpswap retry loop (the remaining runs).
define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB61_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB61_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v0, v1, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB61_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
  %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; seq_cst fadd of <2 x half> with no syncscope operand (checked as SCOPE_SYS on
; gfx1200) at a byte offset of 2044 from %ptr; the atomicrmw result is returned.
; In the checks below, the gfx1200 and gfx940 runs select the returning form of
; flat_atomic_pk_add_f16 (th:TH_ATOMIC_RETURN / sc0 sc1); every other run
; expands to a flat_atomic_cmpswap retry loop and returns the last loaded value.
define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB62_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB62_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB62_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB62_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB62_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[4:5]
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB62_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
  %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ret <2 x half> %result
}

; seq_cst fadd of <2 x half> with no syncscope operand (checked as SCOPE_SYS on
; gfx1200) at a byte offset of 2044 from %ptr; the atomicrmw result is unused.
; In the checks below, the gfx1200 and gfx940 runs select the non-returning
; flat_atomic_pk_add_f16; every other run expands to a flat_atomic_cmpswap retry
; loop.
define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB63_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB63_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB63_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB63_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB63_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB63_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
  %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
15777; GFX12-NEXT: s_wait_bvhcnt 0x0 15778; GFX12-NEXT: s_wait_kmcnt 0x0 15779; GFX12-NEXT: s_wait_storecnt 0x0 15780; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 15781; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15782; GFX12-NEXT: global_inv scope:SCOPE_DEV 15783; GFX12-NEXT: s_setpc_b64 s[30:31] 15784; 15785; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 15786; GFX940: ; %bb.0: 15787; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15788; GFX940-NEXT: buffer_wbl2 sc1 15789; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 15790; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15791; GFX940-NEXT: buffer_inv sc1 15792; GFX940-NEXT: s_setpc_b64 s[30:31] 15793; 15794; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 15795; GFX11: ; %bb.0: 15796; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15797; GFX11-NEXT: flat_load_b32 v3, v[0:1] 15798; GFX11-NEXT: s_mov_b32 s0, 0 15799; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start 15800; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15801; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15802; GFX11-NEXT: v_mov_b32_e32 v4, v3 15803; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15804; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 15805; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15806; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 15807; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15808; GFX11-NEXT: buffer_gl1_inv 15809; GFX11-NEXT: buffer_gl0_inv 15810; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 15811; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 15812; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15813; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 15814; GFX11-NEXT: s_cbranch_execnz .LBB64_1 15815; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15816; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 15817; GFX11-NEXT: v_mov_b32_e32 v0, v3 15818; GFX11-NEXT: s_setpc_b64 s[30:31] 15819; 15820; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 
15821; GFX10: ; %bb.0: 15822; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15823; GFX10-NEXT: flat_load_dword v3, v[0:1] 15824; GFX10-NEXT: s_mov_b32 s4, 0 15825; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start 15826; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15827; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15828; GFX10-NEXT: v_mov_b32_e32 v4, v3 15829; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 15830; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 15831; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 15832; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15833; GFX10-NEXT: buffer_gl1_inv 15834; GFX10-NEXT: buffer_gl0_inv 15835; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 15836; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 15837; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 15838; GFX10-NEXT: s_cbranch_execnz .LBB64_1 15839; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 15840; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 15841; GFX10-NEXT: v_mov_b32_e32 v0, v3 15842; GFX10-NEXT: s_setpc_b64 s[30:31] 15843; 15844; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 15845; GFX90A: ; %bb.0: 15846; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15847; GFX90A-NEXT: flat_load_dword v3, v[0:1] 15848; GFX90A-NEXT: s_mov_b64 s[4:5], 0 15849; GFX90A-NEXT: .LBB64_1: ; %atomicrmw.start 15850; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 15851; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15852; GFX90A-NEXT: v_mov_b32_e32 v5, v3 15853; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 15854; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 15855; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15856; GFX90A-NEXT: buffer_wbinvl1 15857; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 15858; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15859; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 15860; GFX90A-NEXT: s_cbranch_execnz .LBB64_1 15861; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15862; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 15863; GFX90A-NEXT: v_mov_b32_e32 v0, v3 15864; GFX90A-NEXT: s_setpc_b64 
s[30:31] 15865; 15866; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 15867; GFX908: ; %bb.0: 15868; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15869; GFX908-NEXT: flat_load_dword v3, v[0:1] 15870; GFX908-NEXT: s_mov_b64 s[4:5], 0 15871; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start 15872; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15873; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15874; GFX908-NEXT: v_mov_b32_e32 v4, v3 15875; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 15876; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 15877; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15878; GFX908-NEXT: buffer_wbinvl1 15879; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 15880; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15881; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 15882; GFX908-NEXT: s_cbranch_execnz .LBB64_1 15883; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15884; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 15885; GFX908-NEXT: v_mov_b32_e32 v0, v3 15886; GFX908-NEXT: s_setpc_b64 s[30:31] 15887; 15888; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 15889; GFX8: ; %bb.0: 15890; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15891; GFX8-NEXT: flat_load_dword v3, v[0:1] 15892; GFX8-NEXT: s_mov_b64 s[4:5], 0 15893; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start 15894; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15895; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15896; GFX8-NEXT: v_mov_b32_e32 v4, v3 15897; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 15898; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 15899; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 15900; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 15901; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 15902; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 15903; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15904; GFX8-NEXT: buffer_wbinvl1 15905; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 15906; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15907; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 15908; GFX8-NEXT: s_cbranch_execnz .LBB64_1 15909; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15910; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 15911; GFX8-NEXT: v_mov_b32_e32 v0, v3 15912; GFX8-NEXT: s_setpc_b64 s[30:31] 15913; 15914; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: 15915; GFX7: ; %bb.0: 15916; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15917; GFX7-NEXT: flat_load_dword v5, v[0:1] 15918; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 15919; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 15920; GFX7-NEXT: s_mov_b64 s[4:5], 0 15921; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 15922; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15923; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 15924; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 15925; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 15926; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 15927; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start 15928; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15929; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 15930; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 15931; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 15932; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 15933; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 15934; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 15935; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 15936; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 15937; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 15938; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 15939; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 15940; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 15941; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc 15942; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15943; GFX7-NEXT: buffer_wbinvl1 15944; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 15945; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 15946; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 15947; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 15948; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15949; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15950; GFX7-NEXT: s_cbranch_execnz .LBB64_1 15951; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15952; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15953; GFX7-NEXT: v_mov_b32_e32 v0, v2 15954; GFX7-NEXT: v_mov_b32_e32 v1, v3 15955; GFX7-NEXT: s_setpc_b64 s[30:31] 15956 %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 15957 ret <2 x half> %result 15958} 15959 15960define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 { 15961; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 15962; GFX12: ; %bb.0: 15963; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15964; GFX12-NEXT: s_wait_expcnt 0x0 15965; GFX12-NEXT: s_wait_samplecnt 0x0 15966; GFX12-NEXT: s_wait_bvhcnt 0x0 15967; GFX12-NEXT: s_wait_kmcnt 0x0 15968; GFX12-NEXT: s_wait_storecnt 0x0 15969; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV 15970; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 15971; GFX12-NEXT: global_inv scope:SCOPE_DEV 15972; GFX12-NEXT: s_setpc_b64 s[30:31] 15973; 15974; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 15975; GFX940: ; %bb.0: 15976; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15977; GFX940-NEXT: buffer_wbl2 sc1 15978; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 15979; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15980; GFX940-NEXT: buffer_inv sc1 15981; GFX940-NEXT: s_setpc_b64 s[30:31] 15982; 15983; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 15984; GFX11: ; %bb.0: 15985; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15986; GFX11-NEXT: flat_load_b32 v4, v[0:1] 15987; GFX11-NEXT: s_mov_b32 s0, 0 15988; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start 15989; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15990; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15991; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 15992; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15993; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 15994; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15995; GFX11-NEXT: buffer_gl1_inv 15996; 
GFX11-NEXT: buffer_gl0_inv 15997; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 15998; GFX11-NEXT: v_mov_b32_e32 v4, v3 15999; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 16000; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16001; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 16002; GFX11-NEXT: s_cbranch_execnz .LBB65_1 16003; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16004; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 16005; GFX11-NEXT: s_setpc_b64 s[30:31] 16006; 16007; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 16008; GFX10: ; %bb.0: 16009; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16010; GFX10-NEXT: flat_load_dword v4, v[0:1] 16011; GFX10-NEXT: s_mov_b32 s4, 0 16012; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start 16013; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16014; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16015; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 16016; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16017; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16018; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16019; GFX10-NEXT: buffer_gl1_inv 16020; GFX10-NEXT: buffer_gl0_inv 16021; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 16022; GFX10-NEXT: v_mov_b32_e32 v4, v3 16023; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 16024; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 16025; GFX10-NEXT: s_cbranch_execnz .LBB65_1 16026; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16027; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 16028; GFX10-NEXT: s_setpc_b64 s[30:31] 16029; 16030; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 16031; GFX90A: ; %bb.0: 16032; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16033; GFX90A-NEXT: flat_load_dword v5, v[0:1] 16034; GFX90A-NEXT: s_mov_b64 s[4:5], 0 16035; GFX90A-NEXT: .LBB65_1: ; %atomicrmw.start 16036; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16037; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16038; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 16039; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], 
v[4:5] glc 16040; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16041; GFX90A-NEXT: buffer_wbinvl1 16042; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 16043; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16044; GFX90A-NEXT: v_mov_b32_e32 v5, v3 16045; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 16046; GFX90A-NEXT: s_cbranch_execnz .LBB65_1 16047; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16048; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 16049; GFX90A-NEXT: s_setpc_b64 s[30:31] 16050; 16051; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 16052; GFX908: ; %bb.0: 16053; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16054; GFX908-NEXT: flat_load_dword v4, v[0:1] 16055; GFX908-NEXT: s_mov_b64 s[4:5], 0 16056; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start 16057; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16058; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16059; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 16060; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16061; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16062; GFX908-NEXT: buffer_wbinvl1 16063; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 16064; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16065; GFX908-NEXT: v_mov_b32_e32 v4, v3 16066; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 16067; GFX908-NEXT: s_cbranch_execnz .LBB65_1 16068; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16069; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 16070; GFX908-NEXT: s_setpc_b64 s[30:31] 16071; 16072; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 16073; GFX8: ; %bb.0: 16074; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16075; GFX8-NEXT: flat_load_dword v4, v[0:1] 16076; GFX8-NEXT: s_mov_b64 s[4:5], 0 16077; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start 16078; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16079; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16080; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 16081; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 16082; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 16083; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 16084; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 16085; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16086; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16087; GFX8-NEXT: buffer_wbinvl1 16088; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 16089; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16090; GFX8-NEXT: v_mov_b32_e32 v4, v3 16091; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 16092; GFX8-NEXT: s_cbranch_execnz .LBB65_1 16093; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16094; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 16095; GFX8-NEXT: s_setpc_b64 s[30:31] 16096; 16097; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: 16098; GFX7: ; %bb.0: 16099; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16100; GFX7-NEXT: flat_load_dword v5, v[0:1] 16101; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 16102; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 16103; GFX7-NEXT: s_mov_b64 s[4:5], 0 16104; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 16105; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16106; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 16107; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 16108; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 16109; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 16110; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start 16111; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16112; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 16113; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 16114; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 16115; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 16116; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 16117; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 16118; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 16119; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 16120; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 16121; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 16122; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 16123; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 16124; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 16125; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16126; GFX7-NEXT: 
buffer_wbinvl1 16127; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 16128; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 16129; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 16130; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 16131; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16132; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16133; GFX7-NEXT: s_cbranch_execnz .LBB65_1 16134; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16135; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16136; GFX7-NEXT: s_setpc_b64 s[30:31] 16137 %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 16138 ret void 16139} 16140 16141define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 { 16142; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16143; GFX12: ; %bb.0: 16144; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16145; GFX12-NEXT: s_wait_expcnt 0x0 16146; GFX12-NEXT: s_wait_samplecnt 0x0 16147; GFX12-NEXT: s_wait_bvhcnt 0x0 16148; GFX12-NEXT: s_wait_kmcnt 0x0 16149; GFX12-NEXT: s_wait_storecnt 0x0 16150; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 16151; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16152; GFX12-NEXT: global_inv scope:SCOPE_DEV 16153; GFX12-NEXT: s_setpc_b64 s[30:31] 16154; 16155; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16156; GFX940: ; %bb.0: 16157; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16158; GFX940-NEXT: buffer_wbl2 sc1 16159; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 16160; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16161; GFX940-NEXT: buffer_inv sc1 16162; GFX940-NEXT: s_setpc_b64 s[30:31] 16163; 16164; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16165; GFX11: ; %bb.0: 16166; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16167; GFX11-NEXT: flat_load_b32 
v3, v[0:1] 16168; GFX11-NEXT: s_mov_b32 s0, 0 16169; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start 16170; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16171; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16172; GFX11-NEXT: v_mov_b32_e32 v4, v3 16173; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 16174; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 16175; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16176; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 16177; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16178; GFX11-NEXT: buffer_gl1_inv 16179; GFX11-NEXT: buffer_gl0_inv 16180; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 16181; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 16182; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16183; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 16184; GFX11-NEXT: s_cbranch_execnz .LBB66_1 16185; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16186; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 16187; GFX11-NEXT: v_mov_b32_e32 v0, v3 16188; GFX11-NEXT: s_setpc_b64 s[30:31] 16189; 16190; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16191; GFX10: ; %bb.0: 16192; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16193; GFX10-NEXT: flat_load_dword v3, v[0:1] 16194; GFX10-NEXT: s_mov_b32 s4, 0 16195; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start 16196; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16197; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16198; GFX10-NEXT: v_mov_b32_e32 v4, v3 16199; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 16200; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16201; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16202; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16203; GFX10-NEXT: buffer_gl1_inv 16204; GFX10-NEXT: buffer_gl0_inv 16205; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 16206; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 16207; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 16208; GFX10-NEXT: s_cbranch_execnz .LBB66_1 16209; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16210; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 16211; GFX10-NEXT: v_mov_b32_e32 v0, v3 16212; GFX10-NEXT: s_setpc_b64 s[30:31] 16213; 16214; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16215; GFX90A: ; %bb.0: 16216; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16217; GFX90A-NEXT: flat_load_dword v3, v[0:1] 16218; GFX90A-NEXT: s_mov_b64 s[4:5], 0 16219; GFX90A-NEXT: .LBB66_1: ; %atomicrmw.start 16220; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16221; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16222; GFX90A-NEXT: v_mov_b32_e32 v5, v3 16223; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 16224; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 16225; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16226; GFX90A-NEXT: buffer_wbinvl1 16227; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 16228; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16229; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 16230; GFX90A-NEXT: s_cbranch_execnz .LBB66_1 16231; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16232; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 16233; GFX90A-NEXT: v_mov_b32_e32 v0, v3 16234; GFX90A-NEXT: s_setpc_b64 s[30:31] 16235; 16236; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16237; GFX908: ; %bb.0: 16238; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16239; GFX908-NEXT: flat_load_dword v3, v[0:1] 16240; GFX908-NEXT: s_mov_b64 s[4:5], 0 16241; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start 16242; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16243; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16244; GFX908-NEXT: v_mov_b32_e32 v4, v3 16245; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 16246; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16247; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16248; GFX908-NEXT: buffer_wbinvl1 16249; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 16250; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16251; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 16252; GFX908-NEXT: 
s_cbranch_execnz .LBB66_1 16253; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16254; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 16255; GFX908-NEXT: v_mov_b32_e32 v0, v3 16256; GFX908-NEXT: s_setpc_b64 s[30:31] 16257; 16258; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16259; GFX8: ; %bb.0: 16260; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16261; GFX8-NEXT: flat_load_dword v3, v[0:1] 16262; GFX8-NEXT: s_mov_b64 s[4:5], 0 16263; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start 16264; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16265; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16266; GFX8-NEXT: v_mov_b32_e32 v4, v3 16267; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 16268; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 16269; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 16270; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 16271; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 16272; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16273; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16274; GFX8-NEXT: buffer_wbinvl1 16275; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 16276; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16277; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 16278; GFX8-NEXT: s_cbranch_execnz .LBB66_1 16279; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16280; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 16281; GFX8-NEXT: v_mov_b32_e32 v0, v3 16282; GFX8-NEXT: s_setpc_b64 s[30:31] 16283; 16284; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16285; GFX7: ; %bb.0: 16286; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16287; GFX7-NEXT: flat_load_dword v5, v[0:1] 16288; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 16289; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 16290; GFX7-NEXT: s_mov_b64 s[4:5], 0 16291; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 16292; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16293; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 16294; GFX7-NEXT: v_cvt_f32_f16_e32 v2, 
v5 16295; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 16296; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 16297; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start 16298; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16299; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 16300; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 16301; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 16302; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 16303; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 16304; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 16305; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 16306; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 16307; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 16308; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 16309; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 16310; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 16311; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc 16312; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16313; GFX7-NEXT: buffer_wbinvl1 16314; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 16315; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 16316; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 16317; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 16318; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16319; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16320; GFX7-NEXT: s_cbranch_execnz .LBB66_1 16321; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16322; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16323; GFX7-NEXT: v_mov_b32_e32 v0, v2 16324; GFX7-NEXT: v_mov_b32_e32 v1, v3 16325; GFX7-NEXT: s_setpc_b64 s[30:31] 16326 %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 16327 ret <2 x half> %result 16328} 16329 16330define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 { 16331; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16332; GFX12: ; %bb.0: 16333; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16334; GFX12-NEXT: s_wait_expcnt 0x0 16335; GFX12-NEXT: s_wait_samplecnt 0x0 16336; GFX12-NEXT: 
s_wait_bvhcnt 0x0 16337; GFX12-NEXT: s_wait_kmcnt 0x0 16338; GFX12-NEXT: s_wait_storecnt 0x0 16339; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV 16340; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 16341; GFX12-NEXT: global_inv scope:SCOPE_DEV 16342; GFX12-NEXT: s_setpc_b64 s[30:31] 16343; 16344; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16345; GFX940: ; %bb.0: 16346; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16347; GFX940-NEXT: buffer_wbl2 sc1 16348; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 16349; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16350; GFX940-NEXT: buffer_inv sc1 16351; GFX940-NEXT: s_setpc_b64 s[30:31] 16352; 16353; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16354; GFX11: ; %bb.0: 16355; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16356; GFX11-NEXT: flat_load_b32 v4, v[0:1] 16357; GFX11-NEXT: s_mov_b32 s0, 0 16358; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start 16359; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16360; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16361; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 16362; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16363; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 16364; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16365; GFX11-NEXT: buffer_gl1_inv 16366; GFX11-NEXT: buffer_gl0_inv 16367; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 16368; GFX11-NEXT: v_mov_b32_e32 v4, v3 16369; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 16370; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16371; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 16372; GFX11-NEXT: s_cbranch_execnz .LBB67_1 16373; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16374; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 16375; GFX11-NEXT: s_setpc_b64 s[30:31] 16376; 16377; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16378; GFX10: ; %bb.0: 16379; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16380; GFX10-NEXT: flat_load_dword v4, v[0:1] 16381; GFX10-NEXT: s_mov_b32 s4, 0 16382; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start 16383; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16384; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16385; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 16386; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16387; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16388; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16389; GFX10-NEXT: buffer_gl1_inv 16390; GFX10-NEXT: buffer_gl0_inv 16391; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 16392; GFX10-NEXT: v_mov_b32_e32 v4, v3 16393; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 16394; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 16395; GFX10-NEXT: s_cbranch_execnz .LBB67_1 16396; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16397; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 16398; GFX10-NEXT: s_setpc_b64 s[30:31] 16399; 16400; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16401; GFX90A: ; %bb.0: 16402; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16403; GFX90A-NEXT: flat_load_dword v5, v[0:1] 16404; GFX90A-NEXT: s_mov_b64 s[4:5], 0 16405; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start 16406; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16407; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16408; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 16409; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 16410; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16411; GFX90A-NEXT: buffer_wbinvl1 16412; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 16413; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16414; GFX90A-NEXT: v_mov_b32_e32 v5, v3 16415; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 16416; GFX90A-NEXT: s_cbranch_execnz .LBB67_1 16417; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16418; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 16419; GFX90A-NEXT: s_setpc_b64 s[30:31] 16420; 16421; GFX908-LABEL: 
flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16422; GFX908: ; %bb.0: 16423; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16424; GFX908-NEXT: flat_load_dword v4, v[0:1] 16425; GFX908-NEXT: s_mov_b64 s[4:5], 0 16426; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start 16427; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16428; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16429; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 16430; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16431; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16432; GFX908-NEXT: buffer_wbinvl1 16433; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 16434; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16435; GFX908-NEXT: v_mov_b32_e32 v4, v3 16436; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 16437; GFX908-NEXT: s_cbranch_execnz .LBB67_1 16438; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16439; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 16440; GFX908-NEXT: s_setpc_b64 s[30:31] 16441; 16442; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16443; GFX8: ; %bb.0: 16444; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16445; GFX8-NEXT: flat_load_dword v4, v[0:1] 16446; GFX8-NEXT: s_mov_b64 s[4:5], 0 16447; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start 16448; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16449; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16450; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 16451; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 16452; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 16453; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 16454; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 16455; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 16456; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16457; GFX8-NEXT: buffer_wbinvl1 16458; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 16459; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16460; GFX8-NEXT: v_mov_b32_e32 v4, v3 16461; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] 16462; GFX8-NEXT: s_cbranch_execnz .LBB67_1 16463; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16464; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 16465; GFX8-NEXT: s_setpc_b64 s[30:31] 16466; 16467; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 16468; GFX7: ; %bb.0: 16469; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16470; GFX7-NEXT: flat_load_dword v5, v[0:1] 16471; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 16472; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 16473; GFX7-NEXT: s_mov_b64 s[4:5], 0 16474; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 16475; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16476; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 16477; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 16478; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 16479; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 16480; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start 16481; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16482; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 16483; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 16484; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 16485; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 16486; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 16487; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 16488; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 16489; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 16490; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 16491; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 16492; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 16493; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 16494; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 16495; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16496; GFX7-NEXT: buffer_wbinvl1 16497; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 16498; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 16499; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 16500; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 16501; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16502; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16503; GFX7-NEXT: s_cbranch_execnz .LBB67_1 16504; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16505; GFX7-NEXT: s_or_b64 exec, 
exec, s[4:5] 16506; GFX7-NEXT: s_setpc_b64 s[30:31] 16507 %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 16508 ret void 16509} 16510 16511; -------------------------------------------------------------------- 16512; <2 x bfloat> 16513; -------------------------------------------------------------------- 16514 16515define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 16516; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16517; GFX12: ; %bb.0: 16518; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16519; GFX12-NEXT: s_wait_expcnt 0x0 16520; GFX12-NEXT: s_wait_samplecnt 0x0 16521; GFX12-NEXT: s_wait_bvhcnt 0x0 16522; GFX12-NEXT: s_wait_kmcnt 0x0 16523; GFX12-NEXT: s_wait_storecnt 0x0 16524; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 16525; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16526; GFX12-NEXT: global_inv scope:SCOPE_DEV 16527; GFX12-NEXT: s_setpc_b64 s[30:31] 16528; 16529; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16530; GFX940: ; %bb.0: 16531; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16532; GFX940-NEXT: buffer_wbl2 sc1 16533; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 16534; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16535; GFX940-NEXT: buffer_inv sc1 16536; GFX940-NEXT: s_setpc_b64 s[30:31] 16537; 16538; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16539; GFX11: ; %bb.0: 16540; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16541; GFX11-NEXT: flat_load_b32 v3, v[0:1] 16542; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16543; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16544; GFX11-NEXT: s_mov_b32 s1, 0 16545; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 16546; GFX11-NEXT: .p2align 6 16547; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start 16548; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16549; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16550; GFX11-NEXT: v_mov_b32_e32 v6, v3 16551; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16552; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16553; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 16554; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16555; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16556; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 16557; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 16558; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 16559; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16560; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16561; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16562; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 16563; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 16564; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 16565; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16566; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16567; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 16568; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16569; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 16570; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 16571; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16572; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc 16573; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16574; GFX11-NEXT: buffer_gl1_inv 16575; GFX11-NEXT: buffer_gl0_inv 16576; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 16577; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 16578; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16579; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16580; GFX11-NEXT: s_cbranch_execnz .LBB68_1 16581; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16582; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 16583; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 16584; GFX11-NEXT: v_mov_b32_e32 v0, v3 16585; GFX11-NEXT: 
s_setpc_b64 s[30:31] 16586; 16587; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16588; GFX10: ; %bb.0: 16589; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16590; GFX10-NEXT: flat_load_dword v3, v[0:1] 16591; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16592; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16593; GFX10-NEXT: s_mov_b32 s5, 0 16594; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start 16595; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16596; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16597; GFX10-NEXT: v_mov_b32_e32 v6, v3 16598; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16599; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16600; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 16601; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 16602; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 16603; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 16604; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 16605; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 16606; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16607; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 16608; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16609; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 16610; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16611; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 16612; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 16613; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16614; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 16615; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16616; GFX10-NEXT: buffer_gl1_inv 16617; GFX10-NEXT: buffer_gl0_inv 16618; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 16619; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 16620; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 16621; GFX10-NEXT: s_cbranch_execnz .LBB68_1 16622; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16623; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 16624; GFX10-NEXT: v_mov_b32_e32 v0, v3 16625; GFX10-NEXT: s_setpc_b64 s[30:31] 16626; 16627; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16628; GFX90A: ; 
%bb.0: 16629; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16630; GFX90A-NEXT: flat_load_dword v3, v[0:1] 16631; GFX90A-NEXT: s_mov_b64 s[6:7], 0 16632; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16633; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 16634; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16635; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 16636; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start 16637; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16638; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16639; GFX90A-NEXT: v_mov_b32_e32 v7, v3 16640; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 16641; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 16642; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 16643; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 16644; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 16645; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 16646; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 16647; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 16648; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 16649; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 16650; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16651; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16652; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 16653; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16654; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 16655; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc 16656; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16657; GFX90A-NEXT: buffer_wbinvl1 16658; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 16659; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16660; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 16661; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 16662; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16663; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 16664; GFX90A-NEXT: v_mov_b32_e32 v0, v3 16665; GFX90A-NEXT: s_setpc_b64 s[30:31] 16666; 16667; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16668; GFX908: ; %bb.0: 16669; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16670; GFX908-NEXT: 
flat_load_dword v3, v[0:1] 16671; GFX908-NEXT: s_mov_b64 s[6:7], 0 16672; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16673; GFX908-NEXT: s_movk_i32 s8, 0x7fff 16674; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16675; GFX908-NEXT: s_mov_b32 s9, 0x7060302 16676; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start 16677; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16678; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16679; GFX908-NEXT: v_mov_b32_e32 v6, v3 16680; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16681; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16682; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 16683; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 16684; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 16685; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 16686; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 16687; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 16688; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 16689; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 16690; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16691; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16692; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 16693; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16694; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 16695; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 16696; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16697; GFX908-NEXT: buffer_wbinvl1 16698; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 16699; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16700; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 16701; GFX908-NEXT: s_cbranch_execnz .LBB68_1 16702; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16703; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 16704; GFX908-NEXT: v_mov_b32_e32 v0, v3 16705; GFX908-NEXT: s_setpc_b64 s[30:31] 16706; 16707; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16708; GFX8: ; %bb.0: 16709; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16710; GFX8-NEXT: flat_load_dword v3, v[0:1] 16711; GFX8-NEXT: s_mov_b64 s[6:7], 0 16712; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, 
v2 16713; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16714; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start 16715; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16716; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16717; GFX8-NEXT: v_mov_b32_e32 v6, v3 16718; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16719; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16720; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 16721; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 16722; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 16723; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 16724; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 16725; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 16726; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 16727; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 16728; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 16729; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16730; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 16731; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16732; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16733; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 16734; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 16735; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 16736; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 16737; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16738; GFX8-NEXT: buffer_wbinvl1 16739; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 16740; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16741; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 16742; GFX8-NEXT: s_cbranch_execnz .LBB68_1 16743; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16744; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 16745; GFX8-NEXT: v_mov_b32_e32 v0, v3 16746; GFX8-NEXT: s_setpc_b64 s[30:31] 16747; 16748; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: 16749; GFX7: ; %bb.0: 16750; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16751; GFX7-NEXT: flat_load_dword v5, v[0:1] 16752; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 16753; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 16754; GFX7-NEXT: s_mov_b64 s[4:5], 0 16755; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 
16756; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16757; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 16758; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 16759; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16760; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start 16761; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16762; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 16763; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 16764; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 16765; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16766; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 16767; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 16768; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 16769; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 16770; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 16771; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 16772; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc 16773; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16774; GFX7-NEXT: buffer_wbinvl1 16775; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 16776; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 16777; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16778; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16779; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16780; GFX7-NEXT: s_cbranch_execnz .LBB68_1 16781; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16782; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16783; GFX7-NEXT: v_mov_b32_e32 v0, v3 16784; GFX7-NEXT: v_mov_b32_e32 v1, v2 16785; GFX7-NEXT: s_setpc_b64 s[30:31] 16786 %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 16787 ret <2 x bfloat> %result 16788} 16789 16790define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 16791; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16792; GFX12: ; %bb.0: 16793; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16794; GFX12-NEXT: s_wait_expcnt 0x0 16795; GFX12-NEXT: s_wait_samplecnt 0x0 16796; GFX12-NEXT: s_wait_bvhcnt 0x0 16797; 
GFX12-NEXT: s_wait_kmcnt 0x0 16798; GFX12-NEXT: s_wait_storecnt 0x0 16799; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 16800; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16801; GFX12-NEXT: global_inv scope:SCOPE_DEV 16802; GFX12-NEXT: s_setpc_b64 s[30:31] 16803; 16804; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16805; GFX940: ; %bb.0: 16806; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16807; GFX940-NEXT: buffer_wbl2 sc1 16808; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 16809; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16810; GFX940-NEXT: buffer_inv sc1 16811; GFX940-NEXT: s_setpc_b64 s[30:31] 16812; 16813; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16814; GFX11: ; %bb.0: 16815; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16816; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 16817; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16818; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16819; GFX11-NEXT: s_mov_b32 s1, 0 16820; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 16821; GFX11-NEXT: .p2align 6 16822; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start 16823; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16824; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16825; GFX11-NEXT: v_mov_b32_e32 v6, v3 16826; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16827; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16828; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 16829; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16830; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16831; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 16832; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 16833; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 16834; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16835; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16836; 
GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16837; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 16838; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 16839; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 16840; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16841; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16842; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 16843; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16844; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 16845; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 16846; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16847; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc 16848; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16849; GFX11-NEXT: buffer_gl1_inv 16850; GFX11-NEXT: buffer_gl0_inv 16851; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 16852; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 16853; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16854; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16855; GFX11-NEXT: s_cbranch_execnz .LBB69_1 16856; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16857; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 16858; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 16859; GFX11-NEXT: v_mov_b32_e32 v0, v3 16860; GFX11-NEXT: s_setpc_b64 s[30:31] 16861; 16862; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16863; GFX10: ; %bb.0: 16864; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16865; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 16866; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 16867; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 16868; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16869; GFX10-NEXT: s_mov_b32 s5, 0 16870; GFX10-NEXT: flat_load_dword v0, v[3:4] 16871; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start 16872; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16873; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16874; GFX10-NEXT: v_mov_b32_e32 v6, v0 16875; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 16, v6 16876; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16877; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 16878; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 16879; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 16880; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 16881; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 16882; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 16883; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16884; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 16885; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16886; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 16887; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16888; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 16889; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 16890; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16891; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 16892; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16893; GFX10-NEXT: buffer_gl1_inv 16894; GFX10-NEXT: buffer_gl0_inv 16895; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 16896; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 16897; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 16898; GFX10-NEXT: s_cbranch_execnz .LBB69_1 16899; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16900; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 16901; GFX10-NEXT: s_setpc_b64 s[30:31] 16902; 16903; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16904; GFX90A: ; %bb.0: 16905; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16906; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 16907; GFX90A-NEXT: s_mov_b64 s[6:7], 0 16908; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16909; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 16910; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16911; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 16912; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start 16913; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16914; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16915; GFX90A-NEXT: v_mov_b32_e32 v7, v3 16916; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 16917; 
GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 16918; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 16919; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 16920; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 16921; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 16922; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 16923; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 16924; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 16925; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 16926; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16927; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16928; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 16929; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16930; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 16931; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc 16932; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16933; GFX90A-NEXT: buffer_wbinvl1 16934; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 16935; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16936; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 16937; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 16938; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16939; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 16940; GFX90A-NEXT: v_mov_b32_e32 v0, v3 16941; GFX90A-NEXT: s_setpc_b64 s[30:31] 16942; 16943; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16944; GFX908: ; %bb.0: 16945; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16946; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 16947; GFX908-NEXT: s_mov_b64 s[6:7], 0 16948; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16949; GFX908-NEXT: s_movk_i32 s8, 0x7fff 16950; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16951; GFX908-NEXT: s_mov_b32 s9, 0x7060302 16952; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start 16953; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16954; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16955; GFX908-NEXT: v_mov_b32_e32 v6, v3 16956; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16957; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16958; 
GFX908-NEXT: v_add_f32_e32 v3, v3, v4 16959; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 16960; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 16961; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 16962; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 16963; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 16964; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 16965; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 16966; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16967; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16968; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 16969; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16970; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 16971; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc 16972; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16973; GFX908-NEXT: buffer_wbinvl1 16974; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 16975; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16976; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 16977; GFX908-NEXT: s_cbranch_execnz .LBB69_1 16978; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16979; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 16980; GFX908-NEXT: v_mov_b32_e32 v0, v3 16981; GFX908-NEXT: s_setpc_b64 s[30:31] 16982; 16983; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16984; GFX8: ; %bb.0: 16985; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16986; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 16987; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 16988; GFX8-NEXT: flat_load_dword v0, v[3:4] 16989; GFX8-NEXT: s_mov_b64 s[6:7], 0 16990; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 16991; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16992; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start 16993; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16994; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16995; GFX8-NEXT: v_mov_b32_e32 v6, v0 16996; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 16997; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16998; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 16999; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 
17000; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 17001; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 17002; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 17003; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 17004; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 17005; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 17006; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 17007; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 17008; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 17009; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 17010; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 17011; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 17012; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 17013; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 17014; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 17015; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17016; GFX8-NEXT: buffer_wbinvl1 17017; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 17018; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17019; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 17020; GFX8-NEXT: s_cbranch_execnz .LBB69_1 17021; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 17022; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 17023; GFX8-NEXT: s_setpc_b64 s[30:31] 17024; 17025; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17026; GFX7: ; %bb.0: 17027; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17028; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 17029; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 17030; GFX7-NEXT: flat_load_dword v0, v[4:5] 17031; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 17032; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 17033; GFX7-NEXT: s_mov_b64 s[4:5], 0 17034; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 17035; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17036; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17037; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 17038; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17039; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start 17040; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 17041; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17042; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17043; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 17044; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 17045; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 17046; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 17047; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 17048; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 17049; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 17050; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 17051; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 17052; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17053; GFX7-NEXT: buffer_wbinvl1 17054; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 17055; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 17056; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 17057; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17058; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 17059; GFX7-NEXT: s_cbranch_execnz .LBB69_1 17060; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 17061; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 17062; GFX7-NEXT: s_setpc_b64 s[30:31] 17063 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 17064 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 17065 ret <2 x bfloat> %result 17066} 17067 17068define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 17069; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17070; GFX12: ; %bb.0: 17071; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 17072; GFX12-NEXT: s_wait_expcnt 0x0 17073; GFX12-NEXT: s_wait_samplecnt 0x0 17074; GFX12-NEXT: s_wait_bvhcnt 0x0 17075; GFX12-NEXT: s_wait_kmcnt 0x0 17076; GFX12-NEXT: s_wait_storecnt 0x0 17077; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 17078; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 17079; GFX12-NEXT: global_inv scope:SCOPE_DEV 17080; GFX12-NEXT: s_setpc_b64 s[30:31] 17081; 17082; GFX940-LABEL: 
flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17083; GFX940: ; %bb.0: 17084; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17085; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 17086; GFX940-NEXT: s_nop 1 17087; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 17088; GFX940-NEXT: buffer_wbl2 sc1 17089; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 17090; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17091; GFX940-NEXT: buffer_inv sc1 17092; GFX940-NEXT: s_setpc_b64 s[30:31] 17093; 17094; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17095; GFX11: ; %bb.0: 17096; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17097; GFX11-NEXT: v_mov_b32_e32 v3, v0 17098; GFX11-NEXT: s_mov_b32 s1, 0 17099; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17100; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 17101; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 17102; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 17103; GFX11-NEXT: flat_load_b32 v0, v[4:5] 17104; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 17105; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 17106; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17107; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 17108; GFX11-NEXT: .p2align 6 17109; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start 17110; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 17111; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17112; GFX11-NEXT: v_mov_b32_e32 v6, v0 17113; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17114; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 17115; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 17116; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 17117; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17118; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 17119; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 17120; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 17121; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 17122; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 17123; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 17124; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 17125; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 17126; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 17127; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 17128; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 17129; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 17130; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17131; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 17132; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 17133; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 17134; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc 17135; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17136; GFX11-NEXT: buffer_gl1_inv 17137; GFX11-NEXT: buffer_gl0_inv 17138; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 17139; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 17140; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17141; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 17142; GFX11-NEXT: s_cbranch_execnz .LBB70_1 17143; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 17144; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 17145; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 17146; GFX11-NEXT: s_setpc_b64 s[30:31] 17147; 17148; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17149; GFX10: ; %bb.0: 17150; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17151; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 17152; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 17153; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 17154; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17155; GFX10-NEXT: s_mov_b32 s5, 0 17156; GFX10-NEXT: flat_load_dword v0, v[3:4] 17157; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start 17158; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 17159; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 17160; GFX10-NEXT: v_mov_b32_e32 v6, v0 17161; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 17162; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 17163; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 17164; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 17165; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 17166; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 17167; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 17168; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 17169; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 17170; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 17171; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 17172; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 17173; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 17174; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 17175; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 17176; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 17177; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 17178; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17179; GFX10-NEXT: buffer_gl1_inv 17180; GFX10-NEXT: buffer_gl0_inv 17181; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 17182; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 17183; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 17184; GFX10-NEXT: s_cbranch_execnz .LBB70_1 17185; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 17186; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 17187; GFX10-NEXT: s_setpc_b64 s[30:31] 17188; 17189; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17190; GFX90A: ; %bb.0: 17191; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17192; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 17193; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 17194; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 17195; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 17196; GFX90A-NEXT: flat_load_dword v0, v[0:1] 17197; GFX90A-NEXT: s_mov_b64 s[6:7], 0 17198; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 17199; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 17200; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 17201; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 17202; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start 17203; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 17204; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17205; GFX90A-NEXT: v_mov_b32_e32 v7, v0 17206; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 17207; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 17208; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 17209; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 17210; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 17211; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 17212; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 17213; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 17214; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 17215; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 17216; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 17217; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 17218; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] 17219; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc 17220; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 17221; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc 17222; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17223; GFX90A-NEXT: buffer_wbinvl1 17224; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 17225; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17226; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 17227; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 17228; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 17229; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 17230; GFX90A-NEXT: s_setpc_b64 s[30:31] 17231; 17232; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17233; GFX908: ; %bb.0: 17234; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17235; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 17236; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 17237; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 17238; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 17239; GFX908-NEXT: flat_load_dword v0, v[0:1] 17240; GFX908-NEXT: s_mov_b64 s[6:7], 0 17241; GFX908-NEXT: 
v_lshlrev_b32_e32 v1, 16, v2 17242; GFX908-NEXT: s_movk_i32 s8, 0x7fff 17243; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17244; GFX908-NEXT: s_mov_b32 s9, 0x7060302 17245; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start 17246; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 17247; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17248; GFX908-NEXT: v_mov_b32_e32 v6, v0 17249; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 17250; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 17251; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 17252; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 17253; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 17254; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 17255; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 17256; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 17257; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 17258; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 17259; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 17260; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 17261; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 17262; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 17263; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 17264; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 17265; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17266; GFX908-NEXT: buffer_wbinvl1 17267; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 17268; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17269; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 17270; GFX908-NEXT: s_cbranch_execnz .LBB70_1 17271; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 17272; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 17273; GFX908-NEXT: s_setpc_b64 s[30:31] 17274; 17275; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17276; GFX8: ; %bb.0: 17277; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17278; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 17279; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 17280; GFX8-NEXT: flat_load_dword v0, v[3:4] 17281; GFX8-NEXT: s_mov_b64 s[6:7], 0 17282; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 
17283; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17284; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start 17285; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 17286; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17287; GFX8-NEXT: v_mov_b32_e32 v6, v0 17288; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 17289; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 17290; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 17291; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 17292; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 17293; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 17294; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 17295; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 17296; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 17297; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 17298; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 17299; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 17300; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 17301; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 17302; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 17303; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 17304; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 17305; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 17306; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 17307; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17308; GFX8-NEXT: buffer_wbinvl1 17309; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 17310; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17311; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 17312; GFX8-NEXT: s_cbranch_execnz .LBB70_1 17313; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 17314; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 17315; GFX8-NEXT: s_setpc_b64 s[30:31] 17316; 17317; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17318; GFX7: ; %bb.0: 17319; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17320; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 17321; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 17322; GFX7-NEXT: flat_load_dword v0, v[4:5] 17323; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 17324; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 17325; 
GFX7-NEXT: s_mov_b64 s[4:5], 0 17326; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 17327; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17328; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17329; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 17330; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17331; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start 17332; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 17333; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17334; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17335; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 17336; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 17337; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 17338; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 17339; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 17340; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 17341; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 17342; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 17343; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 17344; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17345; GFX7-NEXT: buffer_wbinvl1 17346; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 17347; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 17348; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 17349; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17350; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 17351; GFX7-NEXT: s_cbranch_execnz .LBB70_1 17352; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 17353; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 17354; GFX7-NEXT: s_setpc_b64 s[30:31] 17355 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 17356 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 17357 ret <2 x bfloat> %result 17358} 17359 17360define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 17361; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17362; GFX12: ; %bb.0: 17363; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 17364; GFX12-NEXT: s_wait_expcnt 0x0 17365; GFX12-NEXT: s_wait_samplecnt 0x0 17366; GFX12-NEXT: 
s_wait_bvhcnt 0x0 17367; GFX12-NEXT: s_wait_kmcnt 0x0 17368; GFX12-NEXT: s_wait_storecnt 0x0 17369; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV 17370; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 17371; GFX12-NEXT: global_inv scope:SCOPE_DEV 17372; GFX12-NEXT: s_setpc_b64 s[30:31] 17373; 17374; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17375; GFX940: ; %bb.0: 17376; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17377; GFX940-NEXT: buffer_wbl2 sc1 17378; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 17379; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17380; GFX940-NEXT: buffer_inv sc1 17381; GFX940-NEXT: s_setpc_b64 s[30:31] 17382; 17383; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17384; GFX11: ; %bb.0: 17385; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17386; GFX11-NEXT: flat_load_b32 v3, v[0:1] 17387; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17388; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17389; GFX11-NEXT: s_mov_b32 s1, 0 17390; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 17391; GFX11-NEXT: .p2align 6 17392; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start 17393; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 17394; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17395; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17396; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17397; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17398; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 17399; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 17400; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17401; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 17402; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 17403; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 17404; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 17405; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 17406; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 17407; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 17408; 
GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 17409; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17410; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 17411; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 17412; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17413; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 17414; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 17415; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 17416; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17417; GFX11-NEXT: buffer_gl1_inv 17418; GFX11-NEXT: buffer_gl0_inv 17419; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 17420; GFX11-NEXT: v_mov_b32_e32 v3, v2 17421; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 17422; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17423; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 17424; GFX11-NEXT: s_cbranch_execnz .LBB71_1 17425; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 17426; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 17427; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 17428; GFX11-NEXT: s_setpc_b64 s[30:31] 17429; 17430; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17431; GFX10: ; %bb.0: 17432; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17433; GFX10-NEXT: flat_load_dword v3, v[0:1] 17434; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17435; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17436; GFX10-NEXT: s_mov_b32 s5, 0 17437; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start 17438; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 17439; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17440; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17441; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17442; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 17443; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 17444; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 17445; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 17446; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 17447; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 17448; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 17449; GFX10-NEXT: 
v_add3_u32 v7, v7, v2, 0x7fff 17450; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 17451; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 17452; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 17453; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 17454; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 17455; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 17456; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17457; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17458; GFX10-NEXT: buffer_gl1_inv 17459; GFX10-NEXT: buffer_gl0_inv 17460; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 17461; GFX10-NEXT: v_mov_b32_e32 v3, v2 17462; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 17463; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 17464; GFX10-NEXT: s_cbranch_execnz .LBB71_1 17465; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 17466; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 17467; GFX10-NEXT: s_setpc_b64 s[30:31] 17468; 17469; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17470; GFX90A: ; %bb.0: 17471; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17472; GFX90A-NEXT: flat_load_dword v3, v[0:1] 17473; GFX90A-NEXT: s_mov_b64 s[6:7], 0 17474; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17475; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 17476; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17477; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 17478; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start 17479; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 17480; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17481; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17482; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17483; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 17484; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 17485; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 17486; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 17487; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 17488; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 17489; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 17490; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 17491; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, 
v6, v6 17492; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 17493; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 17494; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 17495; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 17496; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17497; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17498; GFX90A-NEXT: buffer_wbinvl1 17499; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 17500; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17501; GFX90A-NEXT: v_mov_b32_e32 v3, v2 17502; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 17503; GFX90A-NEXT: s_cbranch_execnz .LBB71_1 17504; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 17505; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 17506; GFX90A-NEXT: s_setpc_b64 s[30:31] 17507; 17508; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17509; GFX908: ; %bb.0: 17510; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17511; GFX908-NEXT: flat_load_dword v3, v[0:1] 17512; GFX908-NEXT: s_mov_b64 s[6:7], 0 17513; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17514; GFX908-NEXT: s_movk_i32 s8, 0x7fff 17515; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17516; GFX908-NEXT: s_mov_b32 s9, 0x7060302 17517; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start 17518; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 17519; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17520; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17521; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17522; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 17523; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 17524; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 17525; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 17526; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 17527; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 17528; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 17529; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 17530; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 17531; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 17532; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 17533; GFX908-NEXT: 
v_cndmask_b32_e32 v6, v9, v10, vcc 17534; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 17535; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17536; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17537; GFX908-NEXT: buffer_wbinvl1 17538; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 17539; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17540; GFX908-NEXT: v_mov_b32_e32 v3, v2 17541; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 17542; GFX908-NEXT: s_cbranch_execnz .LBB71_1 17543; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 17544; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 17545; GFX908-NEXT: s_setpc_b64 s[30:31] 17546; 17547; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17548; GFX8: ; %bb.0: 17549; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17550; GFX8-NEXT: flat_load_dword v3, v[0:1] 17551; GFX8-NEXT: s_mov_b64 s[6:7], 0 17552; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17553; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17554; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start 17555; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 17556; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17557; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17558; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17559; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 17560; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 17561; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 17562; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 17563; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 17564; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 17565; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 17566; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 17567; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 17568; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 17569; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 17570; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 17571; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 17572; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 17573; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 17574; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 17575; GFX8-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17576; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17577; GFX8-NEXT: buffer_wbinvl1 17578; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 17579; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17580; GFX8-NEXT: v_mov_b32_e32 v3, v2 17581; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 17582; GFX8-NEXT: s_cbranch_execnz .LBB71_1 17583; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 17584; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 17585; GFX8-NEXT: s_setpc_b64 s[30:31] 17586; 17587; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: 17588; GFX7: ; %bb.0: 17589; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17590; GFX7-NEXT: flat_load_dword v5, v[0:1] 17591; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 17592; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 17593; GFX7-NEXT: s_mov_b64 s[4:5], 0 17594; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17595; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17596; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17597; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 17598; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 17599; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start 17600; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 17601; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 17602; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 17603; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 17604; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 17605; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 17606; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 17607; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 17608; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 17609; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 17610; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 17611; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 17612; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17613; GFX7-NEXT: buffer_wbinvl1 17614; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 17615; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 17616; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 17617; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 17618; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5] 17619; GFX7-NEXT: s_cbranch_execnz .LBB71_1 17620; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 17621; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 17622; GFX7-NEXT: s_setpc_b64 s[30:31] 17623 %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 17624 ret void 17625} 17626 17627define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 17628; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17629; GFX12: ; %bb.0: 17630; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 17631; GFX12-NEXT: s_wait_expcnt 0x0 17632; GFX12-NEXT: s_wait_samplecnt 0x0 17633; GFX12-NEXT: s_wait_bvhcnt 0x0 17634; GFX12-NEXT: s_wait_kmcnt 0x0 17635; GFX12-NEXT: s_wait_storecnt 0x0 17636; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV 17637; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 17638; GFX12-NEXT: global_inv scope:SCOPE_DEV 17639; GFX12-NEXT: s_setpc_b64 s[30:31] 17640; 17641; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17642; GFX940: ; %bb.0: 17643; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17644; GFX940-NEXT: buffer_wbl2 sc1 17645; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 17646; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17647; GFX940-NEXT: buffer_inv sc1 17648; GFX940-NEXT: s_setpc_b64 s[30:31] 17649; 17650; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17651; GFX11: ; %bb.0: 17652; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17653; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 17654; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17655; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17656; GFX11-NEXT: s_mov_b32 s1, 0 17657; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 17658; GFX11-NEXT: .p2align 6 17659; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start 
17660; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 17661; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17662; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17663; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17664; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17665; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 17666; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 17667; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17668; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 17669; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 17670; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 17671; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 17672; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 17673; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 17674; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 17675; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 17676; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17677; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 17678; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 17679; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17680; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 17681; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 17682; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 17683; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17684; GFX11-NEXT: buffer_gl1_inv 17685; GFX11-NEXT: buffer_gl0_inv 17686; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 17687; GFX11-NEXT: v_mov_b32_e32 v3, v2 17688; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 17689; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17690; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 17691; GFX11-NEXT: s_cbranch_execnz .LBB72_1 17692; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 17693; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 17694; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 17695; GFX11-NEXT: s_setpc_b64 s[30:31] 17696; 17697; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17698; GFX10: ; %bb.0: 
17699; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17700; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 17701; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 17702; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17703; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17704; GFX10-NEXT: s_mov_b32 s5, 0 17705; GFX10-NEXT: flat_load_dword v3, v[0:1] 17706; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start 17707; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 17708; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17709; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17710; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17711; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 17712; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 17713; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 17714; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 17715; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 17716; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 17717; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 17718; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 17719; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 17720; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 17721; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 17722; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 17723; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 17724; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 17725; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17726; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17727; GFX10-NEXT: buffer_gl1_inv 17728; GFX10-NEXT: buffer_gl0_inv 17729; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 17730; GFX10-NEXT: v_mov_b32_e32 v3, v2 17731; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 17732; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 17733; GFX10-NEXT: s_cbranch_execnz .LBB72_1 17734; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 17735; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 17736; GFX10-NEXT: s_setpc_b64 s[30:31] 17737; 17738; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17739; GFX90A: ; %bb.0: 17740; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 17741; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 17742; GFX90A-NEXT: s_mov_b64 s[6:7], 0 17743; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17744; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 17745; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17746; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 17747; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start 17748; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 17749; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17750; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17751; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17752; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 17753; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 17754; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 17755; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 17756; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 17757; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 17758; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 17759; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 17760; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 17761; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 17762; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 17763; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 17764; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 17765; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 17766; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17767; GFX90A-NEXT: buffer_wbinvl1 17768; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 17769; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17770; GFX90A-NEXT: v_mov_b32_e32 v3, v2 17771; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 17772; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 17773; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 17774; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 17775; GFX90A-NEXT: s_setpc_b64 s[30:31] 17776; 17777; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17778; GFX908: ; %bb.0: 17779; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17780; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 17781; 
GFX908-NEXT: s_mov_b64 s[6:7], 0 17782; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17783; GFX908-NEXT: s_movk_i32 s8, 0x7fff 17784; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17785; GFX908-NEXT: s_mov_b32 s9, 0x7060302 17786; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start 17787; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 17788; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17789; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17790; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17791; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 17792; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 17793; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 17794; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 17795; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 17796; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 17797; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 17798; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 17799; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 17800; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 17801; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 17802; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 17803; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 17804; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 17805; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17806; GFX908-NEXT: buffer_wbinvl1 17807; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 17808; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17809; GFX908-NEXT: v_mov_b32_e32 v3, v2 17810; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 17811; GFX908-NEXT: s_cbranch_execnz .LBB72_1 17812; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 17813; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 17814; GFX908-NEXT: s_setpc_b64 s[30:31] 17815; 17816; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17817; GFX8: ; %bb.0: 17818; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17819; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 17820; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 17821; GFX8-NEXT: flat_load_dword v3, v[0:1] 17822; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 17823; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17824; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17825; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start 17826; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 17827; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17828; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17829; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17830; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 17831; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 17832; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 17833; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 17834; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 17835; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 17836; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 17837; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 17838; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 17839; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 17840; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 17841; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 17842; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 17843; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 17844; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 17845; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 17846; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17847; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17848; GFX8-NEXT: buffer_wbinvl1 17849; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 17850; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 17851; GFX8-NEXT: v_mov_b32_e32 v3, v2 17852; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 17853; GFX8-NEXT: s_cbranch_execnz .LBB72_1 17854; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 17855; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 17856; GFX8-NEXT: s_setpc_b64 s[30:31] 17857; 17858; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 17859; GFX7: ; %bb.0: 17860; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17861; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 17862; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 17863; GFX7-NEXT: flat_load_dword v5, v[0:1] 17864; GFX7-NEXT: v_mul_f32_e32 
v2, 1.0, v2 17865; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 17866; GFX7-NEXT: s_mov_b64 s[4:5], 0 17867; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17868; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17869; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17870; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 17871; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 17872; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start 17873; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 17874; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 17875; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 17876; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 17877; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 17878; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 17879; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 17880; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 17881; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 17882; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 17883; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 17884; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 17885; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17886; GFX7-NEXT: buffer_wbinvl1 17887; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 17888; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 17889; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 17890; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 17891; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 17892; GFX7-NEXT: s_cbranch_execnz .LBB72_1 17893; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 17894; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 17895; GFX7-NEXT: s_setpc_b64 s[30:31] 17896 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 17897 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 17898 ret void 17899} 17900 17901define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 17902; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17903; GFX12: ; %bb.0: 17904; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 17905; GFX12-NEXT: 
s_wait_expcnt 0x0 17906; GFX12-NEXT: s_wait_samplecnt 0x0 17907; GFX12-NEXT: s_wait_bvhcnt 0x0 17908; GFX12-NEXT: s_wait_kmcnt 0x0 17909; GFX12-NEXT: s_wait_storecnt 0x0 17910; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV 17911; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 17912; GFX12-NEXT: global_inv scope:SCOPE_DEV 17913; GFX12-NEXT: s_setpc_b64 s[30:31] 17914; 17915; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17916; GFX940: ; %bb.0: 17917; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17918; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 17919; GFX940-NEXT: s_nop 1 17920; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 17921; GFX940-NEXT: buffer_wbl2 sc1 17922; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 17923; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17924; GFX940-NEXT: buffer_inv sc1 17925; GFX940-NEXT: s_setpc_b64 s[30:31] 17926; 17927; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17928; GFX11: ; %bb.0: 17929; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17930; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 17931; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 17932; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 17933; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 17934; GFX11-NEXT: flat_load_b32 v3, v[3:4] 17935; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17936; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17937; GFX11-NEXT: s_mov_b32 s1, 0 17938; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 17939; GFX11-NEXT: .p2align 6 17940; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start 17941; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 17942; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17943; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17944; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17945; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17946; 
GFX11-NEXT: v_add_f32_e32 v2, v2, v4 17947; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 17948; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17949; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 17950; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 17951; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 17952; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 17953; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 17954; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 17955; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 17956; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 17957; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17958; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 17959; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 17960; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17961; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 17962; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 17963; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 17964; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17965; GFX11-NEXT: buffer_gl1_inv 17966; GFX11-NEXT: buffer_gl0_inv 17967; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 17968; GFX11-NEXT: v_mov_b32_e32 v3, v2 17969; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 17970; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17971; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 17972; GFX11-NEXT: s_cbranch_execnz .LBB73_1 17973; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 17974; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 17975; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 17976; GFX11-NEXT: s_setpc_b64 s[30:31] 17977; 17978; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 17979; GFX10: ; %bb.0: 17980; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17981; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 17982; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 17983; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17984; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 17985; GFX10-NEXT: s_mov_b32 
s5, 0 17986; GFX10-NEXT: flat_load_dword v3, v[0:1] 17987; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start 17988; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 17989; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17990; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 17991; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 17992; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 17993; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 17994; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 17995; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 17996; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 17997; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 17998; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 17999; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 18000; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 18001; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 18002; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 18003; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 18004; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 18005; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 18006; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18007; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18008; GFX10-NEXT: buffer_gl1_inv 18009; GFX10-NEXT: buffer_gl0_inv 18010; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 18011; GFX10-NEXT: v_mov_b32_e32 v3, v2 18012; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 18013; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 18014; GFX10-NEXT: s_cbranch_execnz .LBB73_1 18015; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 18016; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 18017; GFX10-NEXT: s_setpc_b64 s[30:31] 18018; 18019; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 18020; GFX90A: ; %bb.0: 18021; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18022; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 18023; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 18024; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 18025; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 18026; GFX90A-NEXT: flat_load_dword v1, 
v[0:1] 18027; GFX90A-NEXT: s_mov_b64 s[6:7], 0 18028; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 18029; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 18030; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18031; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 18032; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start 18033; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 18034; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18035; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 18036; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 18037; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 18038; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 18039; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 18040; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 18041; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 18042; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 18043; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 18044; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 18045; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18046; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 18047; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 18048; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18049; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 18050; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 18051; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18052; GFX90A-NEXT: buffer_wbinvl1 18053; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 18054; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18055; GFX90A-NEXT: v_mov_b32_e32 v1, v0 18056; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 18057; GFX90A-NEXT: s_cbranch_execnz .LBB73_1 18058; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 18059; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 18060; GFX90A-NEXT: s_setpc_b64 s[30:31] 18061; 18062; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 18063; GFX908: ; %bb.0: 18064; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18065; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 18066; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 18067; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 
0xfffff800, v0 18068; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 18069; GFX908-NEXT: flat_load_dword v1, v[0:1] 18070; GFX908-NEXT: s_mov_b64 s[6:7], 0 18071; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 18072; GFX908-NEXT: s_movk_i32 s8, 0x7fff 18073; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18074; GFX908-NEXT: s_mov_b32 s9, 0x7060302 18075; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start 18076; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 18077; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18078; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 18079; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 18080; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 18081; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 18082; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 18083; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 18084; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 18085; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 18086; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 18087; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 18088; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18089; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 18090; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 18091; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18092; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 18093; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 18094; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18095; GFX908-NEXT: buffer_wbinvl1 18096; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 18097; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18098; GFX908-NEXT: v_mov_b32_e32 v1, v0 18099; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 18100; GFX908-NEXT: s_cbranch_execnz .LBB73_1 18101; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 18102; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 18103; GFX908-NEXT: s_setpc_b64 s[30:31] 18104; 18105; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 18106; GFX8: ; %bb.0: 18107; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18108; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 
18109; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 18110; GFX8-NEXT: flat_load_dword v3, v[0:1] 18111; GFX8-NEXT: s_mov_b64 s[6:7], 0 18112; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18113; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 18114; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start 18115; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 18116; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18117; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 18118; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18119; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 18120; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 18121; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 18122; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 18123; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 18124; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 18125; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 18126; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 18127; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 18128; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18129; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 18130; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 18131; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18132; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 18133; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 18134; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 18135; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18136; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18137; GFX8-NEXT: buffer_wbinvl1 18138; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 18139; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18140; GFX8-NEXT: v_mov_b32_e32 v3, v2 18141; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 18142; GFX8-NEXT: s_cbranch_execnz .LBB73_1 18143; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 18144; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 18145; GFX8-NEXT: s_setpc_b64 s[30:31] 18146; 18147; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 18148; GFX7: ; %bb.0: 18149; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18150; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 18151; 
GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 18152; GFX7-NEXT: flat_load_dword v5, v[0:1] 18153; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 18154; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 18155; GFX7-NEXT: s_mov_b64 s[4:5], 0 18156; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18157; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18158; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18159; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 18160; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 18161; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start 18162; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 18163; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 18164; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 18165; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 18166; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 18167; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 18168; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 18169; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 18170; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 18171; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 18172; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 18173; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 18174; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18175; GFX7-NEXT: buffer_wbinvl1 18176; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 18177; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 18178; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 18179; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 18180; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 18181; GFX7-NEXT: s_cbranch_execnz .LBB73_1 18182; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 18183; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 18184; GFX7-NEXT: s_setpc_b64 s[30:31] 18185 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 18186 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 18187 ret void 18188} 18189 18190define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 18191; GFX12-LABEL: 
flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18192; GFX12: ; %bb.0: 18193; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 18194; GFX12-NEXT: s_wait_expcnt 0x0 18195; GFX12-NEXT: s_wait_samplecnt 0x0 18196; GFX12-NEXT: s_wait_bvhcnt 0x0 18197; GFX12-NEXT: s_wait_kmcnt 0x0 18198; GFX12-NEXT: global_wb scope:SCOPE_SYS 18199; GFX12-NEXT: s_wait_storecnt 0x0 18200; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 18201; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 18202; GFX12-NEXT: global_inv scope:SCOPE_SYS 18203; GFX12-NEXT: s_setpc_b64 s[30:31] 18204; 18205; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18206; GFX940: ; %bb.0: 18207; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18208; GFX940-NEXT: buffer_wbl2 sc0 sc1 18209; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1 18210; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18211; GFX940-NEXT: buffer_inv sc0 sc1 18212; GFX940-NEXT: s_setpc_b64 s[30:31] 18213; 18214; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18215; GFX11: ; %bb.0: 18216; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18217; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 18218; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18219; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18220; GFX11-NEXT: s_mov_b32 s1, 0 18221; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 18222; GFX11-NEXT: .p2align 6 18223; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start 18224; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 18225; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18226; GFX11-NEXT: v_mov_b32_e32 v6, v3 18227; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 18228; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18229; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 18230; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 18231; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 18232; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 18233; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 18234; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 18235; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 18236; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 18237; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 18238; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 18239; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 18240; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 18241; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 18242; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 18243; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 18244; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 18245; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 18246; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 18247; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 18248; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc 18249; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18250; GFX11-NEXT: buffer_gl1_inv 18251; GFX11-NEXT: buffer_gl0_inv 18252; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 18253; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 18254; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 18255; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 18256; GFX11-NEXT: s_cbranch_execnz .LBB74_1 18257; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 18258; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 18259; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 18260; GFX11-NEXT: v_mov_b32_e32 v0, v3 18261; GFX11-NEXT: s_setpc_b64 s[30:31] 18262; 18263; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18264; GFX10: ; %bb.0: 18265; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18266; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 18267; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 18268; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 18269; 
GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18270; GFX10-NEXT: s_mov_b32 s5, 0 18271; GFX10-NEXT: flat_load_dword v0, v[3:4] 18272; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start 18273; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 18274; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18275; GFX10-NEXT: v_mov_b32_e32 v6, v0 18276; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 18277; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18278; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 18279; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 18280; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 18281; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 18282; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 18283; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 18284; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 18285; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 18286; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 18287; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 18288; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 18289; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 18290; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 18291; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 18292; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 18293; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18294; GFX10-NEXT: buffer_gl1_inv 18295; GFX10-NEXT: buffer_gl0_inv 18296; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 18297; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 18298; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 18299; GFX10-NEXT: s_cbranch_execnz .LBB74_1 18300; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 18301; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 18302; GFX10-NEXT: s_setpc_b64 s[30:31] 18303; 18304; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18305; GFX90A: ; %bb.0: 18306; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18307; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 18308; GFX90A-NEXT: s_mov_b64 s[6:7], 0 18309; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18310; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 18311; GFX90A-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 18312; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 18313; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start 18314; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 18315; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18316; GFX90A-NEXT: v_mov_b32_e32 v7, v3 18317; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 18318; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 18319; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 18320; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 18321; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 18322; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 18323; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 18324; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 18325; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 18326; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 18327; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18328; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 18329; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 18330; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18331; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 18332; GFX90A-NEXT: buffer_wbl2 18333; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc 18334; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18335; GFX90A-NEXT: buffer_invl2 18336; GFX90A-NEXT: buffer_wbinvl1 18337; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 18338; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18339; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 18340; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 18341; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 18342; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 18343; GFX90A-NEXT: v_mov_b32_e32 v0, v3 18344; GFX90A-NEXT: s_setpc_b64 s[30:31] 18345; 18346; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18347; GFX908: ; %bb.0: 18348; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18349; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 18350; GFX908-NEXT: s_mov_b64 s[6:7], 0 18351; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18352; GFX908-NEXT: s_movk_i32 s8, 0x7fff 18353; 
GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18354; GFX908-NEXT: s_mov_b32 s9, 0x7060302 18355; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start 18356; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 18357; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18358; GFX908-NEXT: v_mov_b32_e32 v6, v3 18359; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 18360; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18361; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 18362; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 18363; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 18364; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 18365; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 18366; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 18367; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 18368; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 18369; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18370; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 18371; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 18372; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18373; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 18374; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc 18375; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18376; GFX908-NEXT: buffer_wbinvl1 18377; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 18378; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18379; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 18380; GFX908-NEXT: s_cbranch_execnz .LBB74_1 18381; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 18382; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 18383; GFX908-NEXT: v_mov_b32_e32 v0, v3 18384; GFX908-NEXT: s_setpc_b64 s[30:31] 18385; 18386; GFX8-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18387; GFX8: ; %bb.0: 18388; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18389; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 18390; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 18391; GFX8-NEXT: flat_load_dword v0, v[3:4] 18392; GFX8-NEXT: s_mov_b64 s[6:7], 0 18393; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 18394; GFX8-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 18395; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start 18396; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 18397; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18398; GFX8-NEXT: v_mov_b32_e32 v6, v0 18399; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 18400; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18401; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 18402; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 18403; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 18404; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 18405; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 18406; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 18407; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 18408; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 18409; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 18410; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18411; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 18412; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 18413; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18414; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 18415; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 18416; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 18417; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 18418; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18419; GFX8-NEXT: buffer_wbinvl1 18420; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 18421; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18422; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 18423; GFX8-NEXT: s_cbranch_execnz .LBB74_1 18424; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 18425; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 18426; GFX8-NEXT: s_setpc_b64 s[30:31] 18427; 18428; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18429; GFX7: ; %bb.0: 18430; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18431; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 18432; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 18433; GFX7-NEXT: flat_load_dword v0, v[4:5] 18434; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 18435; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 18436; GFX7-NEXT: s_mov_b64 
s[4:5], 0 18437; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 18438; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18439; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18440; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 18441; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 18442; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start 18443; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 18444; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 18445; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 18446; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 18447; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 18448; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 18449; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 18450; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 18451; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 18452; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 18453; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 18454; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 18455; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18456; GFX7-NEXT: buffer_wbinvl1 18457; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 18458; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 18459; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 18460; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 18461; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 18462; GFX7-NEXT: s_cbranch_execnz .LBB74_1 18463; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 18464; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 18465; GFX7-NEXT: s_setpc_b64 s[30:31] 18466 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 18467 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 18468 ret <2 x bfloat> %result 18469} 18470 18471define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 18472; GFX12-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18473; GFX12: ; %bb.0: 18474; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 18475; GFX12-NEXT: s_wait_expcnt 0x0 18476; GFX12-NEXT: s_wait_samplecnt 0x0 18477; GFX12-NEXT: 
s_wait_bvhcnt 0x0 18478; GFX12-NEXT: s_wait_kmcnt 0x0 18479; GFX12-NEXT: global_wb scope:SCOPE_SYS 18480; GFX12-NEXT: s_wait_storecnt 0x0 18481; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS 18482; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 18483; GFX12-NEXT: global_inv scope:SCOPE_SYS 18484; GFX12-NEXT: s_setpc_b64 s[30:31] 18485; 18486; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18487; GFX940: ; %bb.0: 18488; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18489; GFX940-NEXT: buffer_wbl2 sc0 sc1 18490; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1 18491; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18492; GFX940-NEXT: buffer_inv sc0 sc1 18493; GFX940-NEXT: s_setpc_b64 s[30:31] 18494; 18495; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18496; GFX11: ; %bb.0: 18497; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18498; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 18499; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18500; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 18501; GFX11-NEXT: s_mov_b32 s1, 0 18502; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 18503; GFX11-NEXT: .p2align 6 18504; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start 18505; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 18506; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18507; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 18508; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18509; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 18510; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 18511; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 18512; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 18513; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 18514; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 18515; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 18516; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 18517; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 18518; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 18519; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 18520; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 18521; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 18522; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 18523; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 18524; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 18525; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 18526; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 18527; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 18528; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18529; GFX11-NEXT: buffer_gl1_inv 18530; GFX11-NEXT: buffer_gl0_inv 18531; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 18532; GFX11-NEXT: v_mov_b32_e32 v3, v2 18533; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 18534; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 18535; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 18536; GFX11-NEXT: s_cbranch_execnz .LBB75_1 18537; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 18538; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 18539; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 18540; GFX11-NEXT: s_setpc_b64 s[30:31] 18541; 18542; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18543; GFX10: ; %bb.0: 18544; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18545; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 18546; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 18547; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18548; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 18549; GFX10-NEXT: s_mov_b32 s5, 0 18550; GFX10-NEXT: flat_load_dword v3, v[0:1] 18551; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start 18552; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 18553; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18554; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 18555; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18556; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 18557; GFX10-NEXT: v_add_f32_e32 v6, 
v6, v5 18558; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 18559; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 18560; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 18561; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 18562; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 18563; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 18564; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 18565; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 18566; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 18567; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 18568; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 18569; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 18570; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18571; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18572; GFX10-NEXT: buffer_gl1_inv 18573; GFX10-NEXT: buffer_gl0_inv 18574; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 18575; GFX10-NEXT: v_mov_b32_e32 v3, v2 18576; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 18577; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 18578; GFX10-NEXT: s_cbranch_execnz .LBB75_1 18579; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 18580; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 18581; GFX10-NEXT: s_setpc_b64 s[30:31] 18582; 18583; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18584; GFX90A: ; %bb.0: 18585; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18586; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 18587; GFX90A-NEXT: s_mov_b64 s[6:7], 0 18588; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18589; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 18590; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 18591; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 18592; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start 18593; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 18594; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18595; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 18596; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18597; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 18598; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 18599; GFX90A-NEXT: v_bfe_u32 v7, 
v2, 16, 1 18600; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 18601; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 18602; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 18603; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 18604; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 18605; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18606; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 18607; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 18608; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18609; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 18610; GFX90A-NEXT: buffer_wbl2 18611; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 18612; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18613; GFX90A-NEXT: buffer_invl2 18614; GFX90A-NEXT: buffer_wbinvl1 18615; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 18616; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18617; GFX90A-NEXT: v_mov_b32_e32 v3, v2 18618; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 18619; GFX90A-NEXT: s_cbranch_execnz .LBB75_1 18620; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 18621; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 18622; GFX90A-NEXT: s_setpc_b64 s[30:31] 18623; 18624; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18625; GFX908: ; %bb.0: 18626; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18627; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 18628; GFX908-NEXT: s_mov_b64 s[6:7], 0 18629; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18630; GFX908-NEXT: s_movk_i32 s8, 0x7fff 18631; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 18632; GFX908-NEXT: s_mov_b32 s9, 0x7060302 18633; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start 18634; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 18635; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18636; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 18637; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18638; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 18639; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 18640; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 18641; GFX908-NEXT: 
v_bfe_u32 v9, v6, 16, 1 18642; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 18643; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 18644; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 18645; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 18646; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18647; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 18648; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 18649; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18650; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 18651; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 18652; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18653; GFX908-NEXT: buffer_wbinvl1 18654; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 18655; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18656; GFX908-NEXT: v_mov_b32_e32 v3, v2 18657; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 18658; GFX908-NEXT: s_cbranch_execnz .LBB75_1 18659; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 18660; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 18661; GFX908-NEXT: s_setpc_b64 s[30:31] 18662; 18663; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18664; GFX8: ; %bb.0: 18665; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18666; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 18667; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 18668; GFX8-NEXT: flat_load_dword v3, v[0:1] 18669; GFX8-NEXT: s_mov_b64 s[6:7], 0 18670; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18671; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 18672; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start 18673; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 18674; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18675; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 18676; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18677; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 18678; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 18679; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 18680; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 18681; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 18682; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 18683; 
GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 18684; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 18685; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 18686; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18687; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 18688; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 18689; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18690; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 18691; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 18692; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 18693; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18694; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18695; GFX8-NEXT: buffer_wbinvl1 18696; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 18697; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18698; GFX8-NEXT: v_mov_b32_e32 v3, v2 18699; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 18700; GFX8-NEXT: s_cbranch_execnz .LBB75_1 18701; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 18702; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 18703; GFX8-NEXT: s_setpc_b64 s[30:31] 18704; 18705; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 18706; GFX7: ; %bb.0: 18707; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18708; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 18709; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 18710; GFX7-NEXT: flat_load_dword v5, v[0:1] 18711; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 18712; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 18713; GFX7-NEXT: s_mov_b64 s[4:5], 0 18714; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18715; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18716; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18717; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 18718; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 18719; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start 18720; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 18721; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 18722; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 18723; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 18724; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 18725; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 18726; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 18727; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 18728; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 18729; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 18730; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 18731; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 18732; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18733; GFX7-NEXT: buffer_wbinvl1 18734; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 18735; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 18736; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 18737; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 18738; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 18739; GFX7-NEXT: s_cbranch_execnz .LBB75_1 18740; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 18741; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 18742; GFX7-NEXT: s_setpc_b64 s[30:31] 18743 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 18744 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 18745 ret void 18746} 18747 18748define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 { 18749; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18750; GFX12: ; %bb.0: 18751; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 18752; GFX12-NEXT: s_wait_expcnt 0x0 18753; GFX12-NEXT: s_wait_samplecnt 0x0 18754; GFX12-NEXT: s_wait_bvhcnt 0x0 18755; GFX12-NEXT: s_wait_kmcnt 0x0 18756; GFX12-NEXT: s_wait_storecnt 0x0 18757; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 18758; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 18759; GFX12-NEXT: global_inv scope:SCOPE_DEV 18760; GFX12-NEXT: s_setpc_b64 s[30:31] 18761; 18762; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18763; GFX940: ; %bb.0: 18764; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18765; GFX940-NEXT: buffer_wbl2 sc1 18766; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 18767; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 18768; GFX940-NEXT: buffer_inv sc1 18769; GFX940-NEXT: s_setpc_b64 s[30:31] 18770; 18771; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18772; GFX11: ; %bb.0: 18773; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18774; GFX11-NEXT: flat_load_b32 v3, v[0:1] 18775; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18776; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18777; GFX11-NEXT: s_mov_b32 s1, 0 18778; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 18779; GFX11-NEXT: .p2align 6 18780; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start 18781; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 18782; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18783; GFX11-NEXT: v_mov_b32_e32 v6, v3 18784; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 18785; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18786; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 18787; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 18788; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 18789; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 18790; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 18791; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 18792; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 18793; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 18794; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 18795; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 18796; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 18797; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 18798; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 18799; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 18800; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 18801; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 18802; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 18803; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 18804; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 18805; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], 
v[5:6] glc 18806; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18807; GFX11-NEXT: buffer_gl1_inv 18808; GFX11-NEXT: buffer_gl0_inv 18809; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 18810; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 18811; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 18812; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 18813; GFX11-NEXT: s_cbranch_execnz .LBB76_1 18814; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 18815; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 18816; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 18817; GFX11-NEXT: v_mov_b32_e32 v0, v3 18818; GFX11-NEXT: s_setpc_b64 s[30:31] 18819; 18820; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18821; GFX10: ; %bb.0: 18822; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18823; GFX10-NEXT: flat_load_dword v3, v[0:1] 18824; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18825; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18826; GFX10-NEXT: s_mov_b32 s5, 0 18827; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start 18828; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 18829; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18830; GFX10-NEXT: v_mov_b32_e32 v6, v3 18831; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 18832; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18833; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 18834; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 18835; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 18836; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 18837; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 18838; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 18839; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 18840; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 18841; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 18842; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 18843; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 18844; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 18845; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 18846; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 18847; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 18848; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 18849; GFX10-NEXT: buffer_gl1_inv 18850; GFX10-NEXT: buffer_gl0_inv 18851; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 18852; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 18853; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 18854; GFX10-NEXT: s_cbranch_execnz .LBB76_1 18855; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 18856; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 18857; GFX10-NEXT: v_mov_b32_e32 v0, v3 18858; GFX10-NEXT: s_setpc_b64 s[30:31] 18859; 18860; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18861; GFX90A: ; %bb.0: 18862; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18863; GFX90A-NEXT: flat_load_dword v3, v[0:1] 18864; GFX90A-NEXT: s_mov_b64 s[6:7], 0 18865; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18866; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 18867; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18868; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 18869; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start 18870; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 18871; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18872; GFX90A-NEXT: v_mov_b32_e32 v7, v3 18873; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 18874; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 18875; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 18876; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 18877; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 18878; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 18879; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 18880; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 18881; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 18882; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 18883; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18884; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 18885; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 18886; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18887; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 18888; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc 18889; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18890; GFX90A-NEXT: buffer_wbinvl1 18891; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 18892; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18893; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 18894; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 18895; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 18896; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 18897; GFX90A-NEXT: v_mov_b32_e32 v0, v3 18898; GFX90A-NEXT: s_setpc_b64 s[30:31] 18899; 18900; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18901; GFX908: ; %bb.0: 18902; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18903; GFX908-NEXT: flat_load_dword v3, v[0:1] 18904; GFX908-NEXT: s_mov_b64 s[6:7], 0 18905; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18906; GFX908-NEXT: s_movk_i32 s8, 0x7fff 18907; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18908; GFX908-NEXT: s_mov_b32 s9, 0x7060302 18909; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start 18910; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 18911; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18912; GFX908-NEXT: v_mov_b32_e32 v6, v3 18913; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 18914; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18915; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 18916; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 18917; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 18918; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 18919; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 18920; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 18921; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 18922; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 18923; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18924; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 18925; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 18926; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18927; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 18928; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 18929; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18930; GFX908-NEXT: buffer_wbinvl1 18931; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 18932; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18933; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 18934; GFX908-NEXT: s_cbranch_execnz .LBB76_1 18935; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 18936; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 18937; GFX908-NEXT: v_mov_b32_e32 v0, v3 18938; GFX908-NEXT: s_setpc_b64 s[30:31] 18939; 18940; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18941; GFX8: ; %bb.0: 18942; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18943; GFX8-NEXT: flat_load_dword v3, v[0:1] 18944; GFX8-NEXT: s_mov_b64 s[6:7], 0 18945; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 18946; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18947; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start 18948; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 18949; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18950; GFX8-NEXT: v_mov_b32_e32 v6, v3 18951; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 18952; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18953; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 18954; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 18955; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 18956; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 18957; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 18958; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 18959; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 18960; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 18961; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 18962; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18963; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 18964; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 18965; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18966; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 18967; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 18968; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 18969; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 18970; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18971; GFX8-NEXT: buffer_wbinvl1 18972; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 18973; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 18974; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 18975; GFX8-NEXT: s_cbranch_execnz .LBB76_1 18976; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 18977; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 18978; GFX8-NEXT: v_mov_b32_e32 v0, v3 18979; GFX8-NEXT: s_setpc_b64 s[30:31] 18980; 18981; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: 18982; GFX7: ; %bb.0: 18983; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18984; GFX7-NEXT: flat_load_dword v5, v[0:1] 18985; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 18986; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 18987; GFX7-NEXT: s_mov_b64 s[4:5], 0 18988; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 18989; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18990; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 18991; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 18992; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 18993; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start 18994; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 18995; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 18996; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 18997; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 18998; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 18999; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 19000; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 19001; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 19002; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 19003; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 19004; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 19005; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc 19006; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19007; GFX7-NEXT: buffer_wbinvl1 19008; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 19009; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 19010; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 19011; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 19012; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 19013; GFX7-NEXT: s_cbranch_execnz .LBB76_1 19014; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 19015; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 19016; GFX7-NEXT: v_mov_b32_e32 v0, v3 19017; GFX7-NEXT: v_mov_b32_e32 v1, v2 19018; GFX7-NEXT: s_setpc_b64 s[30:31] 19019 %result = atomicrmw fadd ptr %ptr, <2 x 
bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 19020 ret <2 x bfloat> %result 19021} 19022 19023define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 { 19024; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19025; GFX12: ; %bb.0: 19026; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 19027; GFX12-NEXT: s_wait_expcnt 0x0 19028; GFX12-NEXT: s_wait_samplecnt 0x0 19029; GFX12-NEXT: s_wait_bvhcnt 0x0 19030; GFX12-NEXT: s_wait_kmcnt 0x0 19031; GFX12-NEXT: s_wait_storecnt 0x0 19032; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV 19033; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 19034; GFX12-NEXT: global_inv scope:SCOPE_DEV 19035; GFX12-NEXT: s_setpc_b64 s[30:31] 19036; 19037; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19038; GFX940: ; %bb.0: 19039; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19040; GFX940-NEXT: buffer_wbl2 sc1 19041; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 19042; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19043; GFX940-NEXT: buffer_inv sc1 19044; GFX940-NEXT: s_setpc_b64 s[30:31] 19045; 19046; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19047; GFX11: ; %bb.0: 19048; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19049; GFX11-NEXT: flat_load_b32 v3, v[0:1] 19050; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19051; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19052; GFX11-NEXT: s_mov_b32 s1, 0 19053; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 19054; GFX11-NEXT: .p2align 6 19055; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start 19056; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 19057; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19058; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19059; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19060; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19061; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 19062; GFX11-NEXT: 
v_add_f32_e32 v6, v6, v5 19063; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19064; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 19065; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 19066; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 19067; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 19068; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 19069; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 19070; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 19071; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 19072; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19073; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 19074; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 19075; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 19076; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 19077; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 19078; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 19079; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19080; GFX11-NEXT: buffer_gl1_inv 19081; GFX11-NEXT: buffer_gl0_inv 19082; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 19083; GFX11-NEXT: v_mov_b32_e32 v3, v2 19084; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 19085; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 19086; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 19087; GFX11-NEXT: s_cbranch_execnz .LBB77_1 19088; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 19089; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 19090; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 19091; GFX11-NEXT: s_setpc_b64 s[30:31] 19092; 19093; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19094; GFX10: ; %bb.0: 19095; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19096; GFX10-NEXT: flat_load_dword v3, v[0:1] 19097; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19098; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19099; GFX10-NEXT: s_mov_b32 s5, 0 19100; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start 19101; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 19102; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 19103; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19104; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19105; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 19106; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 19107; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 19108; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 19109; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 19110; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 19111; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 19112; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 19113; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 19114; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 19115; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 19116; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 19117; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 19118; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 19119; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19120; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19121; GFX10-NEXT: buffer_gl1_inv 19122; GFX10-NEXT: buffer_gl0_inv 19123; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 19124; GFX10-NEXT: v_mov_b32_e32 v3, v2 19125; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 19126; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 19127; GFX10-NEXT: s_cbranch_execnz .LBB77_1 19128; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 19129; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 19130; GFX10-NEXT: s_setpc_b64 s[30:31] 19131; 19132; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19133; GFX90A: ; %bb.0: 19134; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19135; GFX90A-NEXT: flat_load_dword v3, v[0:1] 19136; GFX90A-NEXT: s_mov_b64 s[6:7], 0 19137; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19138; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 19139; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19140; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 19141; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start 19142; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 19143; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19144; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19145; 
GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19146; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 19147; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 19148; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 19149; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 19150; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 19151; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 19152; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 19153; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 19154; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 19155; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 19156; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 19157; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 19158; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 19159; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19160; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19161; GFX90A-NEXT: buffer_wbinvl1 19162; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 19163; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19164; GFX90A-NEXT: v_mov_b32_e32 v3, v2 19165; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 19166; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 19167; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 19168; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 19169; GFX90A-NEXT: s_setpc_b64 s[30:31] 19170; 19171; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19172; GFX908: ; %bb.0: 19173; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19174; GFX908-NEXT: flat_load_dword v3, v[0:1] 19175; GFX908-NEXT: s_mov_b64 s[6:7], 0 19176; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19177; GFX908-NEXT: s_movk_i32 s8, 0x7fff 19178; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19179; GFX908-NEXT: s_mov_b32 s9, 0x7060302 19180; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start 19181; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 19182; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19183; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19184; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19185; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 19186; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 19187; 
GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 19188; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 19189; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 19190; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 19191; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 19192; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 19193; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 19194; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 19195; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 19196; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 19197; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 19198; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19199; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19200; GFX908-NEXT: buffer_wbinvl1 19201; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 19202; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19203; GFX908-NEXT: v_mov_b32_e32 v3, v2 19204; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 19205; GFX908-NEXT: s_cbranch_execnz .LBB77_1 19206; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 19207; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 19208; GFX908-NEXT: s_setpc_b64 s[30:31] 19209; 19210; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19211; GFX8: ; %bb.0: 19212; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19213; GFX8-NEXT: flat_load_dword v3, v[0:1] 19214; GFX8-NEXT: s_mov_b64 s[6:7], 0 19215; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19216; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19217; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start 19218; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 19219; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19220; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19221; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19222; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 19223; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 19224; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 19225; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 19226; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 19227; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 19228; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 19229; GFX8-NEXT: v_add_u32_e32 v9, 
vcc, 0x7fff, v9 19230; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 19231; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 19232; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 19233; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 19234; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 19235; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 19236; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 19237; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 19238; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19239; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19240; GFX8-NEXT: buffer_wbinvl1 19241; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 19242; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19243; GFX8-NEXT: v_mov_b32_e32 v3, v2 19244; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 19245; GFX8-NEXT: s_cbranch_execnz .LBB77_1 19246; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 19247; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 19248; GFX8-NEXT: s_setpc_b64 s[30:31] 19249; 19250; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: 19251; GFX7: ; %bb.0: 19252; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19253; GFX7-NEXT: flat_load_dword v5, v[0:1] 19254; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 19255; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 19256; GFX7-NEXT: s_mov_b64 s[4:5], 0 19257; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19258; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19259; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19260; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 19261; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 19262; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start 19263; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 19264; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 19265; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 19266; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 19267; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 19268; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 19269; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 19270; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 19271; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 19272; GFX7-NEXT: 
; The check lines in this region are autogenerated by utils/update_llc_test_checks.py (per the note on the first line of this file); regenerate them with that script instead of editing them by hand.
;
; @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory
;   Issues a seq_cst, syncscope("agent") atomicrmw fadd of a <2 x bfloat> through `ptr %ptr` and returns the value produced by the instruction (%result). The atomicrmw carries both !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory. The GFX12 and GFX940 prefixes expect a single flat_atomic_pk_add_bf16 with a v0 destination; the GFX11, GFX10, GFX90A, GFX908, GFX8 and GFX7 prefixes expect a flat_atomic_cmpswap retry loop running from %atomicrmw.start to %atomicrmw.end.
;
; @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory
;   Same atomicrmw with the same two metadata attachments, but the result (%unused) is not used and the function returns void. The GFX12 and GFX940 prefixes expect flat_atomic_pk_add_bf16 without a destination register; the remaining prefixes again expect a flat_atomic_cmpswap retry loop.
;
; Both functions use attribute group #0 (nounwind); !0 is the empty metadata node referenced by the atomicrmw attachments.
v_lshrrev_b32_e32 v4, 16, v7 19273; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 19274; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 19275; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19276; GFX7-NEXT: buffer_wbinvl1 19277; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 19278; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 19279; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 19280; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 19281; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 19282; GFX7-NEXT: s_cbranch_execnz .LBB77_1 19283; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 19284; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 19285; GFX7-NEXT: s_setpc_b64 s[30:31] 19286 %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 19287 ret void 19288} 19289 19290define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 { 19291; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19292; GFX12: ; %bb.0: 19293; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 19294; GFX12-NEXT: s_wait_expcnt 0x0 19295; GFX12-NEXT: s_wait_samplecnt 0x0 19296; GFX12-NEXT: s_wait_bvhcnt 0x0 19297; GFX12-NEXT: s_wait_kmcnt 0x0 19298; GFX12-NEXT: s_wait_storecnt 0x0 19299; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 19300; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 19301; GFX12-NEXT: global_inv scope:SCOPE_DEV 19302; GFX12-NEXT: s_setpc_b64 s[30:31] 19303; 19304; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19305; GFX940: ; %bb.0: 19306; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19307; GFX940-NEXT: buffer_wbl2 sc1 19308; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 19309; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19310; GFX940-NEXT: buffer_inv sc1 19311; GFX940-NEXT: s_setpc_b64 s[30:31] 19312; 19313; GFX11-LABEL: 
flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19314; GFX11: ; %bb.0: 19315; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19316; GFX11-NEXT: flat_load_b32 v3, v[0:1] 19317; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19318; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19319; GFX11-NEXT: s_mov_b32 s1, 0 19320; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 19321; GFX11-NEXT: .p2align 6 19322; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start 19323; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 19324; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19325; GFX11-NEXT: v_mov_b32_e32 v6, v3 19326; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 19327; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 19328; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 19329; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 19330; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19331; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 19332; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 19333; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 19334; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 19335; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 19336; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 19337; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 19338; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 19339; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 19340; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 19341; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 19342; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 19343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 19344; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 19345; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 19346; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 19347; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc 19348; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19349; GFX11-NEXT: buffer_gl1_inv 
19350; GFX11-NEXT: buffer_gl0_inv 19351; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 19352; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 19353; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 19354; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 19355; GFX11-NEXT: s_cbranch_execnz .LBB78_1 19356; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 19357; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 19358; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 19359; GFX11-NEXT: v_mov_b32_e32 v0, v3 19360; GFX11-NEXT: s_setpc_b64 s[30:31] 19361; 19362; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19363; GFX10: ; %bb.0: 19364; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19365; GFX10-NEXT: flat_load_dword v3, v[0:1] 19366; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19367; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19368; GFX10-NEXT: s_mov_b32 s5, 0 19369; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start 19370; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 19371; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19372; GFX10-NEXT: v_mov_b32_e32 v6, v3 19373; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 19374; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 19375; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 19376; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 19377; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 19378; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 19379; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 19380; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 19381; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 19382; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 19383; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 19384; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 19385; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 19386; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 19387; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 19388; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 19389; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 19390; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19391; GFX10-NEXT: buffer_gl1_inv 
19392; GFX10-NEXT: buffer_gl0_inv 19393; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 19394; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 19395; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 19396; GFX10-NEXT: s_cbranch_execnz .LBB78_1 19397; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 19398; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 19399; GFX10-NEXT: v_mov_b32_e32 v0, v3 19400; GFX10-NEXT: s_setpc_b64 s[30:31] 19401; 19402; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19403; GFX90A: ; %bb.0: 19404; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19405; GFX90A-NEXT: flat_load_dword v3, v[0:1] 19406; GFX90A-NEXT: s_mov_b64 s[6:7], 0 19407; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19408; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 19409; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19410; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 19411; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start 19412; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 19413; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19414; GFX90A-NEXT: v_mov_b32_e32 v7, v3 19415; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 19416; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 19417; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 19418; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 19419; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 19420; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 19421; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 19422; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 19423; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 19424; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 19425; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 19426; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 19427; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 19428; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 19429; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 19430; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc 19431; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19432; GFX90A-NEXT: buffer_wbinvl1 19433; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 
19434; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19435; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 19436; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 19437; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 19438; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 19439; GFX90A-NEXT: v_mov_b32_e32 v0, v3 19440; GFX90A-NEXT: s_setpc_b64 s[30:31] 19441; 19442; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19443; GFX908: ; %bb.0: 19444; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19445; GFX908-NEXT: flat_load_dword v3, v[0:1] 19446; GFX908-NEXT: s_mov_b64 s[6:7], 0 19447; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19448; GFX908-NEXT: s_movk_i32 s8, 0x7fff 19449; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19450; GFX908-NEXT: s_mov_b32 s9, 0x7060302 19451; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start 19452; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 19453; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19454; GFX908-NEXT: v_mov_b32_e32 v6, v3 19455; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 19456; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 19457; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 19458; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 19459; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 19460; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 19461; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 19462; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 19463; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 19464; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 19465; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 19466; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 19467; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 19468; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 19469; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 19470; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 19471; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19472; GFX908-NEXT: buffer_wbinvl1 19473; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 19474; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19475; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[6:7] 19476; GFX908-NEXT: s_cbranch_execnz .LBB78_1 19477; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 19478; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 19479; GFX908-NEXT: v_mov_b32_e32 v0, v3 19480; GFX908-NEXT: s_setpc_b64 s[30:31] 19481; 19482; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19483; GFX8: ; %bb.0: 19484; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19485; GFX8-NEXT: flat_load_dword v3, v[0:1] 19486; GFX8-NEXT: s_mov_b64 s[6:7], 0 19487; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19488; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19489; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start 19490; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 19491; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19492; GFX8-NEXT: v_mov_b32_e32 v6, v3 19493; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 19494; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 19495; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 19496; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 19497; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 19498; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 19499; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 19500; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 19501; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 19502; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 19503; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 19504; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 19505; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 19506; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 19507; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 19508; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 19509; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 19510; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 19511; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 19512; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19513; GFX8-NEXT: buffer_wbinvl1 19514; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 19515; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19516; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 19517; GFX8-NEXT: 
s_cbranch_execnz .LBB78_1 19518; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 19519; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 19520; GFX8-NEXT: v_mov_b32_e32 v0, v3 19521; GFX8-NEXT: s_setpc_b64 s[30:31] 19522; 19523; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19524; GFX7: ; %bb.0: 19525; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19526; GFX7-NEXT: flat_load_dword v5, v[0:1] 19527; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 19528; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 19529; GFX7-NEXT: s_mov_b64 s[4:5], 0 19530; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 19531; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19532; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 19533; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 19534; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 19535; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start 19536; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 19537; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 19538; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 19539; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 19540; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19541; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 19542; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 19543; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 19544; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 19545; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 19546; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 19547; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc 19548; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19549; GFX7-NEXT: buffer_wbinvl1 19550; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 19551; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 19552; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 19553; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 19554; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 19555; GFX7-NEXT: s_cbranch_execnz .LBB78_1 19556; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 19557; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 19558; GFX7-NEXT: v_mov_b32_e32 v0, v3 19559; GFX7-NEXT: v_mov_b32_e32 v1, v2 19560; GFX7-NEXT: 
s_setpc_b64 s[30:31] 19561 %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 19562 ret <2 x bfloat> %result 19563} 19564 19565define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 { 19566; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19567; GFX12: ; %bb.0: 19568; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 19569; GFX12-NEXT: s_wait_expcnt 0x0 19570; GFX12-NEXT: s_wait_samplecnt 0x0 19571; GFX12-NEXT: s_wait_bvhcnt 0x0 19572; GFX12-NEXT: s_wait_kmcnt 0x0 19573; GFX12-NEXT: s_wait_storecnt 0x0 19574; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV 19575; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 19576; GFX12-NEXT: global_inv scope:SCOPE_DEV 19577; GFX12-NEXT: s_setpc_b64 s[30:31] 19578; 19579; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19580; GFX940: ; %bb.0: 19581; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19582; GFX940-NEXT: buffer_wbl2 sc1 19583; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 19584; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19585; GFX940-NEXT: buffer_inv sc1 19586; GFX940-NEXT: s_setpc_b64 s[30:31] 19587; 19588; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19589; GFX11: ; %bb.0: 19590; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19591; GFX11-NEXT: flat_load_b32 v3, v[0:1] 19592; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19593; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19594; GFX11-NEXT: s_mov_b32 s1, 0 19595; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 19596; GFX11-NEXT: .p2align 6 19597; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start 19598; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 19599; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19600; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 19601; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19602; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19603; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 19604; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 19605; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19606; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 19607; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 19608; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 19609; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 19610; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 19611; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 19612; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 19613; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 19614; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 19615; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 19616; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 19617; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 19618; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 19619; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 19620; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 19621; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19622; GFX11-NEXT: buffer_gl1_inv 19623; GFX11-NEXT: buffer_gl0_inv 19624; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 19625; GFX11-NEXT: v_mov_b32_e32 v3, v2 19626; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 19627; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 19628; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 19629; GFX11-NEXT: s_cbranch_execnz .LBB79_1 19630; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 19631; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 19632; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 19633; GFX11-NEXT: s_setpc_b64 s[30:31] 19634; 19635; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19636; GFX10: ; %bb.0: 19637; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19638; GFX10-NEXT: flat_load_dword v3, v[0:1] 19639; GFX10-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 19640; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19641; GFX10-NEXT: s_mov_b32 s5, 0 19642; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start 19643; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 19644; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19645; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19646; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19647; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 19648; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 19649; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 19650; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 19651; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 19652; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 19653; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 19654; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 19655; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 19656; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 19657; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 19658; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 19659; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 19660; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 19661; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19662; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19663; GFX10-NEXT: buffer_gl1_inv 19664; GFX10-NEXT: buffer_gl0_inv 19665; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 19666; GFX10-NEXT: v_mov_b32_e32 v3, v2 19667; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 19668; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 19669; GFX10-NEXT: s_cbranch_execnz .LBB79_1 19670; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 19671; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 19672; GFX10-NEXT: s_setpc_b64 s[30:31] 19673; 19674; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19675; GFX90A: ; %bb.0: 19676; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19677; GFX90A-NEXT: flat_load_dword v3, v[0:1] 19678; GFX90A-NEXT: s_mov_b64 s[6:7], 0 19679; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19680; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 19681; GFX90A-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v2 19682; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 19683; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start 19684; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 19685; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19686; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19687; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19688; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 19689; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 19690; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 19691; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 19692; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 19693; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 19694; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 19695; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 19696; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 19697; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 19698; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 19699; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 19700; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 19701; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19702; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19703; GFX90A-NEXT: buffer_wbinvl1 19704; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 19705; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19706; GFX90A-NEXT: v_mov_b32_e32 v3, v2 19707; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 19708; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 19709; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 19710; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 19711; GFX90A-NEXT: s_setpc_b64 s[30:31] 19712; 19713; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19714; GFX908: ; %bb.0: 19715; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19716; GFX908-NEXT: flat_load_dword v3, v[0:1] 19717; GFX908-NEXT: s_mov_b64 s[6:7], 0 19718; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19719; GFX908-NEXT: s_movk_i32 s8, 0x7fff 19720; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19721; GFX908-NEXT: s_mov_b32 s9, 0x7060302 19722; GFX908-NEXT: .LBB79_1: ; 
%atomicrmw.start 19723; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 19724; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19725; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19726; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 19727; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 19728; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 19729; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 19730; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 19731; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 19732; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 19733; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 19734; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 19735; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 19736; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 19737; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 19738; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 19739; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 19740; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19741; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19742; GFX908-NEXT: buffer_wbinvl1 19743; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 19744; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19745; GFX908-NEXT: v_mov_b32_e32 v3, v2 19746; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 19747; GFX908-NEXT: s_cbranch_execnz .LBB79_1 19748; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 19749; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 19750; GFX908-NEXT: s_setpc_b64 s[30:31] 19751; 19752; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19753; GFX8: ; %bb.0: 19754; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19755; GFX8-NEXT: flat_load_dword v3, v[0:1] 19756; GFX8-NEXT: s_mov_b64 s[6:7], 0 19757; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 19758; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 19759; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start 19760; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 19761; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19762; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 19763; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 
19764; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 19765; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 19766; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 19767; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 19768; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 19769; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 19770; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 19771; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 19772; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 19773; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 19774; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 19775; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 19776; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 19777; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 19778; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 19779; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 19780; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19781; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19782; GFX8-NEXT: buffer_wbinvl1 19783; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 19784; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 19785; GFX8-NEXT: v_mov_b32_e32 v3, v2 19786; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 19787; GFX8-NEXT: s_cbranch_execnz .LBB79_1 19788; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 19789; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 19790; GFX8-NEXT: s_setpc_b64 s[30:31] 19791; 19792; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 19793; GFX7: ; %bb.0: 19794; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19795; GFX7-NEXT: flat_load_dword v5, v[0:1] 19796; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 19797; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 19798; GFX7-NEXT: s_mov_b64 s[4:5], 0 19799; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19800; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19801; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19802; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 19803; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 19804; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start 19805; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 19806; GFX7-NEXT: 
v_mul_f32_e32 v4, 1.0, v4 19807; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 19808; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 19809; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 19810; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 19811; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 19812; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 19813; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 19814; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 19815; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 19816; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 19817; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19818; GFX7-NEXT: buffer_wbinvl1 19819; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 19820; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 19821; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 19822; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 19823; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 19824; GFX7-NEXT: s_cbranch_execnz .LBB79_1 19825; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 19826; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 19827; GFX7-NEXT: s_setpc_b64 s[30:31] 19828 %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 19829 ret void 19830} 19831 19832attributes #0 = { nounwind } 19833attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 19834 19835!0 = !{} 19836