1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s 3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s 4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s 7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s 8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s 9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s 10; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s 11 12; -------------------------------------------------------------------- 13; float 14; -------------------------------------------------------------------- 15 16define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { 17; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 18; GFX12: ; %bb.0: 19; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 20; GFX12-NEXT: s_wait_expcnt 0x0 21; GFX12-NEXT: s_wait_samplecnt 0x0 22; GFX12-NEXT: s_wait_bvhcnt 0x0 23; GFX12-NEXT: s_wait_kmcnt 0x0 24; GFX12-NEXT: v_mov_b32_e32 v1, s16 25; GFX12-NEXT: s_wait_storecnt 0x0 26; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 27; GFX12-NEXT: s_wait_loadcnt 0x0 28; GFX12-NEXT: global_inv scope:SCOPE_DEV 29; GFX12-NEXT: s_setpc_b64 s[30:31] 30; 31; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 32; GFX940: ; %bb.0: 33; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34; GFX940-NEXT: v_mov_b32_e32 v1, s16 35; GFX940-NEXT: buffer_wbl2 sc1 36; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 37; GFX940-NEXT: s_waitcnt vmcnt(0) 38; GFX940-NEXT: buffer_inv sc1 39; GFX940-NEXT: s_setpc_b64 s[30:31] 40; 41; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 42; GFX11: ; %bb.0: 43; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 44; GFX11-NEXT: v_mov_b32_e32 v1, s16 45; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 46; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc 47; GFX11-NEXT: s_waitcnt vmcnt(0) 48; GFX11-NEXT: buffer_gl1_inv 49; GFX11-NEXT: buffer_gl0_inv 50; GFX11-NEXT: s_setpc_b64 s[30:31] 51; 52; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 53; GFX10: ; %bb.0: 54; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 55; GFX10-NEXT: v_mov_b32_e32 v2, v0 56; GFX10-NEXT: v_mov_b32_e32 v0, s20 57; GFX10-NEXT: s_add_i32 s4, s20, 0x400 58; GFX10-NEXT: v_mov_b32_e32 v3, s4 59; GFX10-NEXT: s_mov_b32 s4, 0 60; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 61; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start 62; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 63; GFX10-NEXT: s_waitcnt vmcnt(0) 64; GFX10-NEXT: v_mov_b32_e32 v5, v0 65; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 66; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 67; GFX10-NEXT: v_mov_b32_e32 v0, v4 68; GFX10-NEXT: v_mov_b32_e32 v1, v5 69; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 70; GFX10-NEXT: s_waitcnt vmcnt(0) 71; GFX10-NEXT: buffer_gl1_inv 72; GFX10-NEXT: buffer_gl0_inv 73; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 74; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 75; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 76; GFX10-NEXT: s_cbranch_execnz .LBB0_1 77; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 78; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 79; GFX10-NEXT: s_setpc_b64 s[30:31] 80; 81; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 82; GFX90A: ; %bb.0: 83; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 84; GFX90A-NEXT: v_mov_b32_e32 v1, s20 85; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 glc 86; GFX90A-NEXT: s_waitcnt vmcnt(0) 87; GFX90A-NEXT: buffer_wbinvl1 88; GFX90A-NEXT: s_setpc_b64 s[30:31] 89; 90; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 91; GFX908: ; %bb.0: 92; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 93; GFX908-NEXT: v_mov_b32_e32 v2, v0 94; GFX908-NEXT: v_mov_b32_e32 v0, s20 95; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 96; GFX908-NEXT: s_add_i32 s6, s20, 0x400 97; GFX908-NEXT: s_mov_b64 s[4:5], 0 98; GFX908-NEXT: v_mov_b32_e32 v3, s6 99; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start 100; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 101; GFX908-NEXT: s_waitcnt vmcnt(0) 102; GFX908-NEXT: v_mov_b32_e32 v5, v0 103; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 104; GFX908-NEXT: v_mov_b32_e32 v0, v4 105; GFX908-NEXT: v_mov_b32_e32 v1, v5 106; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 107; GFX908-NEXT: s_waitcnt vmcnt(0) 108; GFX908-NEXT: buffer_wbinvl1 109; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 110; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 111; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 112; GFX908-NEXT: s_cbranch_execnz .LBB0_1 113; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 114; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 115; GFX908-NEXT: s_setpc_b64 s[30:31] 116; 117; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 118; GFX8: ; %bb.0: 119; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 120; GFX8-NEXT: v_mov_b32_e32 v2, v0 121; GFX8-NEXT: v_mov_b32_e32 v0, s20 122; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 123; GFX8-NEXT: s_add_i32 s6, s20, 0x400 124; GFX8-NEXT: s_mov_b64 s[4:5], 0 125; GFX8-NEXT: v_mov_b32_e32 v3, s6 126; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start 127; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 128; GFX8-NEXT: s_waitcnt vmcnt(0) 129; GFX8-NEXT: v_mov_b32_e32 v5, v0 130; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 131; GFX8-NEXT: v_mov_b32_e32 v0, v4 132; GFX8-NEXT: v_mov_b32_e32 v1, v5 133; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 134; GFX8-NEXT: s_waitcnt vmcnt(0) 135; GFX8-NEXT: buffer_wbinvl1 136; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 137; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 138; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 139; GFX8-NEXT: s_cbranch_execnz .LBB0_1 140; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 141; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 142; GFX8-NEXT: s_setpc_b64 s[30:31] 143; 144; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 145; GFX7: ; %bb.0: 146; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 147; GFX7-NEXT: v_mov_b32_e32 v2, v0 148; GFX7-NEXT: v_mov_b32_e32 v0, s20 149; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 150; GFX7-NEXT: s_add_i32 s6, s20, 0x400 151; GFX7-NEXT: s_mov_b64 s[4:5], 0 152; GFX7-NEXT: v_mov_b32_e32 v3, s6 153; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start 154; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 155; GFX7-NEXT: s_waitcnt vmcnt(0) 156; GFX7-NEXT: v_mov_b32_e32 v5, v0 157; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 158; GFX7-NEXT: v_mov_b32_e32 v0, v4 159; GFX7-NEXT: v_mov_b32_e32 v1, v5 160; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 161; 
GFX7-NEXT: s_waitcnt vmcnt(0) 162; GFX7-NEXT: buffer_wbinvl1 163; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 164; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 165; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 166; GFX7-NEXT: s_cbranch_execnz .LBB0_1 167; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 168; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 169; GFX7-NEXT: s_setpc_b64 s[30:31] 170; 171; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 172; GFX6: ; %bb.0: 173; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 174; GFX6-NEXT: v_mov_b32_e32 v2, v0 175; GFX6-NEXT: v_mov_b32_e32 v0, s20 176; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 177; GFX6-NEXT: s_add_i32 s6, s20, 0x400 178; GFX6-NEXT: s_mov_b64 s[4:5], 0 179; GFX6-NEXT: v_mov_b32_e32 v3, s6 180; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start 181; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 182; GFX6-NEXT: s_waitcnt vmcnt(0) 183; GFX6-NEXT: v_mov_b32_e32 v5, v0 184; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 185; GFX6-NEXT: s_waitcnt expcnt(0) 186; GFX6-NEXT: v_mov_b32_e32 v0, v4 187; GFX6-NEXT: v_mov_b32_e32 v1, v5 188; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 189; GFX6-NEXT: s_waitcnt vmcnt(0) 190; GFX6-NEXT: buffer_wbinvl1 191; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 192; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 193; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 194; GFX6-NEXT: s_cbranch_execnz .LBB0_1 195; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 196; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 197; GFX6-NEXT: s_waitcnt expcnt(0) 198; GFX6-NEXT: s_setpc_b64 s[30:31] 199 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 200 %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 201 ret float %result 202} 203 204define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { 205; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 206; GFX12: ; %bb.0: 207; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 208; GFX12-NEXT: s_wait_expcnt 0x0 209; GFX12-NEXT: s_wait_samplecnt 0x0 210; GFX12-NEXT: s_wait_bvhcnt 0x0 211; GFX12-NEXT: s_wait_kmcnt 0x0 212; GFX12-NEXT: v_mov_b32_e32 v1, s16 213; GFX12-NEXT: s_wait_storecnt 0x0 214; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 215; GFX12-NEXT: s_wait_storecnt 0x0 216; GFX12-NEXT: global_inv scope:SCOPE_DEV 217; GFX12-NEXT: s_setpc_b64 s[30:31] 218; 219; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 220; GFX940: ; %bb.0: 221; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 222; GFX940-NEXT: v_mov_b32_e32 v1, s16 223; GFX940-NEXT: buffer_wbl2 sc1 224; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 225; GFX940-NEXT: s_waitcnt vmcnt(0) 226; GFX940-NEXT: buffer_inv sc1 227; GFX940-NEXT: s_setpc_b64 s[30:31] 228; 229; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 230; GFX11: ; %bb.0: 231; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 232; GFX11-NEXT: v_mov_b32_e32 v1, s16 233; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 234; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 235; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 236; GFX11-NEXT: buffer_gl1_inv 237; GFX11-NEXT: buffer_gl0_inv 238; GFX11-NEXT: s_setpc_b64 s[30:31] 239; 240; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 241; GFX10: ; %bb.0: 242; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 243; GFX10-NEXT: v_mov_b32_e32 
v1, s20 244; GFX10-NEXT: s_add_i32 s4, s20, 0x400 245; GFX10-NEXT: v_mov_b32_e32 v3, s4 246; GFX10-NEXT: s_mov_b32 s4, 0 247; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 248; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start 249; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 250; GFX10-NEXT: s_waitcnt vmcnt(0) 251; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 252; GFX10-NEXT: v_mov_b32_e32 v5, v2 253; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 254; GFX10-NEXT: v_mov_b32_e32 v4, v1 255; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 256; GFX10-NEXT: s_waitcnt vmcnt(0) 257; GFX10-NEXT: buffer_gl1_inv 258; GFX10-NEXT: buffer_gl0_inv 259; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 260; GFX10-NEXT: v_mov_b32_e32 v2, v4 261; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 262; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 263; GFX10-NEXT: s_cbranch_execnz .LBB1_1 264; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 265; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 266; GFX10-NEXT: s_setpc_b64 s[30:31] 267; 268; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 269; GFX90A: ; %bb.0: 270; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 271; GFX90A-NEXT: v_mov_b32_e32 v1, s20 272; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 273; GFX90A-NEXT: s_waitcnt vmcnt(0) 274; GFX90A-NEXT: buffer_wbinvl1 275; GFX90A-NEXT: s_setpc_b64 s[30:31] 276; 277; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 278; GFX908: ; %bb.0: 279; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 280; GFX908-NEXT: v_mov_b32_e32 v1, s20 281; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 282; GFX908-NEXT: s_waitcnt vmcnt(0) 283; GFX908-NEXT: buffer_wbinvl1 284; GFX908-NEXT: s_setpc_b64 s[30:31] 285; 286; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 287; GFX8: ; %bb.0: 288; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 289; GFX8-NEXT: v_mov_b32_e32 v1, s20 290; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 291; GFX8-NEXT: s_add_i32 s6, s20, 0x400 292; GFX8-NEXT: s_mov_b64 s[4:5], 0 293; GFX8-NEXT: v_mov_b32_e32 v3, s6 294; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start 295; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 296; GFX8-NEXT: s_waitcnt vmcnt(0) 297; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 298; GFX8-NEXT: v_mov_b32_e32 v5, v2 299; GFX8-NEXT: v_mov_b32_e32 v4, v1 300; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 301; GFX8-NEXT: s_waitcnt vmcnt(0) 302; GFX8-NEXT: buffer_wbinvl1 303; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 304; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 305; GFX8-NEXT: v_mov_b32_e32 v2, v4 306; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 307; GFX8-NEXT: s_cbranch_execnz .LBB1_1 308; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 309; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 310; GFX8-NEXT: s_setpc_b64 s[30:31] 311; 312; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 313; GFX7: ; %bb.0: 314; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 315; GFX7-NEXT: v_mov_b32_e32 v1, s20 316; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 317; GFX7-NEXT: s_add_i32 s6, s20, 0x400 318; GFX7-NEXT: s_mov_b64 s[4:5], 0 319; GFX7-NEXT: v_mov_b32_e32 v3, s6 320; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start 321; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 322; GFX7-NEXT: s_waitcnt vmcnt(0) 323; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 324; GFX7-NEXT: v_mov_b32_e32 v5, v2 325; GFX7-NEXT: v_mov_b32_e32 v4, v1 326; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 327; GFX7-NEXT: s_waitcnt vmcnt(0) 328; GFX7-NEXT: buffer_wbinvl1 329; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 
330; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 331; GFX7-NEXT: v_mov_b32_e32 v2, v4 332; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 333; GFX7-NEXT: s_cbranch_execnz .LBB1_1 334; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 335; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 336; GFX7-NEXT: s_setpc_b64 s[30:31] 337; 338; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 339; GFX6: ; %bb.0: 340; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 341; GFX6-NEXT: v_mov_b32_e32 v1, s20 342; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 343; GFX6-NEXT: s_add_i32 s6, s20, 0x400 344; GFX6-NEXT: s_mov_b64 s[4:5], 0 345; GFX6-NEXT: v_mov_b32_e32 v3, s6 346; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start 347; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 348; GFX6-NEXT: s_waitcnt vmcnt(0) 349; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 350; GFX6-NEXT: s_waitcnt expcnt(0) 351; GFX6-NEXT: v_mov_b32_e32 v5, v2 352; GFX6-NEXT: v_mov_b32_e32 v4, v1 353; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 354; GFX6-NEXT: s_waitcnt vmcnt(0) 355; GFX6-NEXT: buffer_wbinvl1 356; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 357; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 358; GFX6-NEXT: v_mov_b32_e32 v2, v4 359; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 360; GFX6-NEXT: s_cbranch_execnz .LBB1_1 361; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 362; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 363; GFX6-NEXT: s_waitcnt expcnt(0) 364; GFX6-NEXT: s_setpc_b64 s[30:31] 365 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 366 %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 367 ret void 368} 369 370define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) %ptr, float %val) #0 { 371; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 372; GFX12: ; %bb.0: 373; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 374; GFX12-NEXT: s_wait_expcnt 0x0 375; GFX12-NEXT: s_wait_samplecnt 0x0 376; GFX12-NEXT: s_wait_bvhcnt 0x0 377; GFX12-NEXT: s_wait_kmcnt 0x0 378; GFX12-NEXT: s_mov_b32 s1, exec_lo 379; GFX12-NEXT: s_wait_storecnt 0x0 380; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 381; GFX12-NEXT: v_readfirstlane_b32 s4, v0 382; GFX12-NEXT: v_readfirstlane_b32 s5, v1 383; GFX12-NEXT: v_readfirstlane_b32 s6, v2 384; GFX12-NEXT: v_readfirstlane_b32 s7, v3 385; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 386; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 387; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 388; GFX12-NEXT: s_wait_alu 0xfffe 389; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 390; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 391; GFX12-NEXT: s_wait_alu 0xfffe 392; GFX12-NEXT: s_and_saveexec_b32 s0, s0 393; GFX12-NEXT: s_wait_loadcnt 0x0 394; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN 395; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 396; GFX12-NEXT: ; implicit-def: $vgpr4 397; GFX12-NEXT: s_wait_alu 0xfffe 398; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 399; GFX12-NEXT: s_cbranch_execnz .LBB2_1 400; GFX12-NEXT: ; %bb.2: 401; GFX12-NEXT: s_mov_b32 exec_lo, s1 402; GFX12-NEXT: s_wait_loadcnt 0x0 403; GFX12-NEXT: v_mov_b32_e32 v0, v5 404; GFX12-NEXT: global_inv scope:SCOPE_DEV 405; GFX12-NEXT: s_wait_alu 0xfffe 406; GFX12-NEXT: s_setpc_b64 s[30:31] 407; 408; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 409; GFX940: ; %bb.0: 410; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 411; GFX940-NEXT: s_mov_b64 s[2:3], exec 412; GFX940-NEXT: buffer_wbl2 sc1 413; GFX940-NEXT: .LBB2_1: ; =>This Inner 
Loop Header: Depth=1 414; GFX940-NEXT: v_readfirstlane_b32 s4, v0 415; GFX940-NEXT: v_readfirstlane_b32 s5, v1 416; GFX940-NEXT: v_readfirstlane_b32 s6, v2 417; GFX940-NEXT: v_readfirstlane_b32 s7, v3 418; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 419; GFX940-NEXT: s_nop 0 420; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 421; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 422; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 423; GFX940-NEXT: s_waitcnt vmcnt(0) 424; GFX940-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0 425; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 426; GFX940-NEXT: ; implicit-def: $vgpr4 427; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 428; GFX940-NEXT: s_cbranch_execnz .LBB2_1 429; GFX940-NEXT: ; %bb.2: 430; GFX940-NEXT: s_mov_b64 exec, s[2:3] 431; GFX940-NEXT: s_waitcnt vmcnt(0) 432; GFX940-NEXT: v_mov_b32_e32 v0, v5 433; GFX940-NEXT: buffer_inv sc1 434; GFX940-NEXT: s_setpc_b64 s[30:31] 435; 436; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 437; GFX11: ; %bb.0: 438; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 439; GFX11-NEXT: s_mov_b32 s1, exec_lo 440; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 441; GFX11-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 442; GFX11-NEXT: v_readfirstlane_b32 s4, v0 443; GFX11-NEXT: v_readfirstlane_b32 s5, v1 444; GFX11-NEXT: v_readfirstlane_b32 s6, v2 445; GFX11-NEXT: v_readfirstlane_b32 s7, v3 446; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 447; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 448; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 449; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 450; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 451; GFX11-NEXT: s_and_saveexec_b32 s0, s0 452; GFX11-NEXT: s_waitcnt vmcnt(0) 453; GFX11-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 glc 454; 
GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 455; GFX11-NEXT: ; implicit-def: $vgpr4 456; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 457; GFX11-NEXT: s_cbranch_execnz .LBB2_1 458; GFX11-NEXT: ; %bb.2: 459; GFX11-NEXT: s_mov_b32 exec_lo, s1 460; GFX11-NEXT: s_waitcnt vmcnt(0) 461; GFX11-NEXT: v_mov_b32_e32 v0, v5 462; GFX11-NEXT: buffer_gl1_inv 463; GFX11-NEXT: buffer_gl0_inv 464; GFX11-NEXT: s_setpc_b64 s[30:31] 465; 466; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 467; GFX10: ; %bb.0: 468; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 469; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 470; GFX10-NEXT: s_mov_b32 s5, 0 471; GFX10-NEXT: s_mov_b32 s6, exec_lo 472; GFX10-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 473; GFX10-NEXT: v_readfirstlane_b32 s8, v0 474; GFX10-NEXT: v_readfirstlane_b32 s9, v1 475; GFX10-NEXT: v_readfirstlane_b32 s10, v2 476; GFX10-NEXT: v_readfirstlane_b32 s11, v3 477; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 478; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 479; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 480; GFX10-NEXT: s_and_saveexec_b32 s4, s4 481; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 482; GFX10-NEXT: ; implicit-def: $vgpr4 483; GFX10-NEXT: s_waitcnt_depctr 0xffe3 484; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 485; GFX10-NEXT: s_cbranch_execnz .LBB2_1 486; GFX10-NEXT: ; %bb.2: 487; GFX10-NEXT: s_mov_b32 exec_lo, s6 488; GFX10-NEXT: .LBB2_3: ; %atomicrmw.start 489; GFX10-NEXT: ; =>This Loop Header: Depth=1 490; GFX10-NEXT: ; Child Loop BB2_4 Depth 2 491; GFX10-NEXT: s_waitcnt vmcnt(0) 492; GFX10-NEXT: v_add_f32_e32 v7, v8, v5 493; GFX10-NEXT: s_mov_b32 s6, exec_lo 494; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 495; GFX10-NEXT: v_mov_b32_e32 v6, v7 496; GFX10-NEXT: v_mov_b32_e32 v7, v8 497; GFX10-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 498; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 499; 
GFX10-NEXT: v_readfirstlane_b32 s8, v0 500; GFX10-NEXT: v_readfirstlane_b32 s9, v1 501; GFX10-NEXT: v_readfirstlane_b32 s10, v2 502; GFX10-NEXT: v_readfirstlane_b32 s11, v3 503; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 504; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 505; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 506; GFX10-NEXT: s_and_saveexec_b32 s4, s4 507; GFX10-NEXT: s_waitcnt vmcnt(0) 508; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 509; GFX10-NEXT: s_waitcnt_depctr 0xffe3 510; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 511; GFX10-NEXT: s_cbranch_execnz .LBB2_4 512; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 513; GFX10-NEXT: s_mov_b32 exec_lo, s6 514; GFX10-NEXT: s_waitcnt vmcnt(0) 515; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 516; GFX10-NEXT: v_mov_b32_e32 v8, v6 517; GFX10-NEXT: buffer_gl1_inv 518; GFX10-NEXT: buffer_gl0_inv 519; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 520; GFX10-NEXT: s_waitcnt_depctr 0xffe3 521; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 522; GFX10-NEXT: s_cbranch_execnz .LBB2_3 523; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 524; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 525; GFX10-NEXT: v_mov_b32_e32 v0, v6 526; GFX10-NEXT: s_setpc_b64 s[30:31] 527; 528; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 529; GFX90A: ; %bb.0: 530; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 531; GFX90A-NEXT: s_mov_b64 s[6:7], exec 532; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 533; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 534; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 535; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 536; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 537; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 538; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 539; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 540; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 541; GFX90A-NEXT: s_waitcnt vmcnt(0) 
542; GFX90A-NEXT: buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc 543; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 544; GFX90A-NEXT: ; implicit-def: $vgpr4 545; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 546; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 547; GFX90A-NEXT: ; %bb.2: 548; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 549; GFX90A-NEXT: s_waitcnt vmcnt(0) 550; GFX90A-NEXT: v_mov_b32_e32 v0, v5 551; GFX90A-NEXT: buffer_wbinvl1 552; GFX90A-NEXT: s_setpc_b64 s[30:31] 553; 554; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 555; GFX908: ; %bb.0: 556; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 557; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 558; GFX908-NEXT: s_mov_b64 s[6:7], exec 559; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 560; GFX908-NEXT: v_readfirstlane_b32 s8, v0 561; GFX908-NEXT: v_readfirstlane_b32 s9, v1 562; GFX908-NEXT: v_readfirstlane_b32 s10, v2 563; GFX908-NEXT: v_readfirstlane_b32 s11, v3 564; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 565; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 566; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 567; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 568; GFX908-NEXT: s_nop 0 569; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 570; GFX908-NEXT: ; implicit-def: $vgpr4 571; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 572; GFX908-NEXT: s_cbranch_execnz .LBB2_1 573; GFX908-NEXT: ; %bb.2: 574; GFX908-NEXT: s_mov_b64 exec, s[6:7] 575; GFX908-NEXT: s_mov_b64 s[6:7], 0 576; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start 577; GFX908-NEXT: ; =>This Loop Header: Depth=1 578; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 579; GFX908-NEXT: s_waitcnt vmcnt(0) 580; GFX908-NEXT: v_add_f32_e32 v7, v8, v5 581; GFX908-NEXT: v_mov_b32_e32 v6, v7 582; GFX908-NEXT: s_mov_b64 s[12:13], exec 583; GFX908-NEXT: v_mov_b32_e32 v7, v8 584; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 585; 
GFX908-NEXT: ; => This Inner Loop Header: Depth=2 586; GFX908-NEXT: v_readfirstlane_b32 s8, v0 587; GFX908-NEXT: v_readfirstlane_b32 s9, v1 588; GFX908-NEXT: v_readfirstlane_b32 s10, v2 589; GFX908-NEXT: v_readfirstlane_b32 s11, v3 590; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 591; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 592; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 593; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 594; GFX908-NEXT: s_waitcnt vmcnt(0) 595; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 596; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 597; GFX908-NEXT: s_cbranch_execnz .LBB2_4 598; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 599; GFX908-NEXT: s_mov_b64 exec, s[12:13] 600; GFX908-NEXT: s_waitcnt vmcnt(0) 601; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 602; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 603; GFX908-NEXT: v_mov_b32_e32 v8, v6 604; GFX908-NEXT: buffer_wbinvl1 605; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 606; GFX908-NEXT: s_cbranch_execnz .LBB2_3 607; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 608; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 609; GFX908-NEXT: v_mov_b32_e32 v0, v6 610; GFX908-NEXT: s_setpc_b64 s[30:31] 611; 612; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 613; GFX8: ; %bb.0: 614; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 615; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 616; GFX8-NEXT: s_mov_b64 s[6:7], exec 617; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 618; GFX8-NEXT: v_readfirstlane_b32 s8, v0 619; GFX8-NEXT: v_readfirstlane_b32 s9, v1 620; GFX8-NEXT: v_readfirstlane_b32 s10, v2 621; GFX8-NEXT: v_readfirstlane_b32 s11, v3 622; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 623; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 624; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 625; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 626; GFX8-NEXT: s_nop 0 627; 
GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 628; GFX8-NEXT: ; implicit-def: $vgpr4 629; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 630; GFX8-NEXT: s_cbranch_execnz .LBB2_1 631; GFX8-NEXT: ; %bb.2: 632; GFX8-NEXT: s_mov_b64 exec, s[6:7] 633; GFX8-NEXT: s_mov_b64 s[6:7], 0 634; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start 635; GFX8-NEXT: ; =>This Loop Header: Depth=1 636; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 637; GFX8-NEXT: s_waitcnt vmcnt(0) 638; GFX8-NEXT: v_add_f32_e32 v7, v8, v5 639; GFX8-NEXT: v_mov_b32_e32 v6, v7 640; GFX8-NEXT: s_mov_b64 s[12:13], exec 641; GFX8-NEXT: v_mov_b32_e32 v7, v8 642; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 643; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 644; GFX8-NEXT: v_readfirstlane_b32 s8, v0 645; GFX8-NEXT: v_readfirstlane_b32 s9, v1 646; GFX8-NEXT: v_readfirstlane_b32 s10, v2 647; GFX8-NEXT: v_readfirstlane_b32 s11, v3 648; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 649; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 650; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 651; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 652; GFX8-NEXT: s_waitcnt vmcnt(0) 653; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 654; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 655; GFX8-NEXT: s_cbranch_execnz .LBB2_4 656; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 657; GFX8-NEXT: s_mov_b64 exec, s[12:13] 658; GFX8-NEXT: s_waitcnt vmcnt(0) 659; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 660; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 661; GFX8-NEXT: v_mov_b32_e32 v8, v6 662; GFX8-NEXT: buffer_wbinvl1 663; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 664; GFX8-NEXT: s_cbranch_execnz .LBB2_3 665; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 666; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 667; GFX8-NEXT: v_mov_b32_e32 v0, v6 668; GFX8-NEXT: s_setpc_b64 s[30:31] 669; 670; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 671; GFX7: ; %bb.0: 
672; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 673; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 674; GFX7-NEXT: s_mov_b64 s[6:7], exec 675; GFX7-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 676; GFX7-NEXT: v_readfirstlane_b32 s8, v0 677; GFX7-NEXT: v_readfirstlane_b32 s9, v1 678; GFX7-NEXT: v_readfirstlane_b32 s10, v2 679; GFX7-NEXT: v_readfirstlane_b32 s11, v3 680; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 681; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 682; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 683; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 684; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 685; GFX7-NEXT: ; implicit-def: $vgpr4 686; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 687; GFX7-NEXT: s_cbranch_execnz .LBB2_1 688; GFX7-NEXT: ; %bb.2: 689; GFX7-NEXT: s_mov_b64 exec, s[6:7] 690; GFX7-NEXT: s_mov_b64 s[6:7], 0 691; GFX7-NEXT: .LBB2_3: ; %atomicrmw.start 692; GFX7-NEXT: ; =>This Loop Header: Depth=1 693; GFX7-NEXT: ; Child Loop BB2_4 Depth 2 694; GFX7-NEXT: s_waitcnt vmcnt(0) 695; GFX7-NEXT: v_add_f32_e32 v7, v8, v5 696; GFX7-NEXT: v_mov_b32_e32 v6, v7 697; GFX7-NEXT: s_mov_b64 s[12:13], exec 698; GFX7-NEXT: v_mov_b32_e32 v7, v8 699; GFX7-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 700; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 701; GFX7-NEXT: v_readfirstlane_b32 s8, v0 702; GFX7-NEXT: v_readfirstlane_b32 s9, v1 703; GFX7-NEXT: v_readfirstlane_b32 s10, v2 704; GFX7-NEXT: v_readfirstlane_b32 s11, v3 705; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 706; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 707; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 708; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 709; GFX7-NEXT: s_waitcnt vmcnt(0) 710; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 711; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 712; GFX7-NEXT: s_cbranch_execnz .LBB2_4 713; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 714; GFX7-NEXT: s_mov_b64 exec, s[12:13] 715; GFX7-NEXT: 
s_waitcnt vmcnt(0) 716; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 717; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 718; GFX7-NEXT: v_mov_b32_e32 v8, v6 719; GFX7-NEXT: buffer_wbinvl1 720; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 721; GFX7-NEXT: s_cbranch_execnz .LBB2_3 722; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 723; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 724; GFX7-NEXT: v_mov_b32_e32 v0, v6 725; GFX7-NEXT: s_setpc_b64 s[30:31] 726; 727; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: 728; GFX6: ; %bb.0: 729; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 730; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 731; GFX6-NEXT: s_mov_b64 s[6:7], exec 732; GFX6-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 733; GFX6-NEXT: v_readfirstlane_b32 s8, v0 734; GFX6-NEXT: v_readfirstlane_b32 s9, v1 735; GFX6-NEXT: v_readfirstlane_b32 s10, v2 736; GFX6-NEXT: v_readfirstlane_b32 s11, v3 737; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 738; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 739; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 740; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 741; GFX6-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 742; GFX6-NEXT: ; implicit-def: $vgpr4 743; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 744; GFX6-NEXT: s_cbranch_execnz .LBB2_1 745; GFX6-NEXT: ; %bb.2: 746; GFX6-NEXT: s_mov_b64 exec, s[6:7] 747; GFX6-NEXT: s_mov_b64 s[6:7], 0 748; GFX6-NEXT: .LBB2_3: ; %atomicrmw.start 749; GFX6-NEXT: ; =>This Loop Header: Depth=1 750; GFX6-NEXT: ; Child Loop BB2_4 Depth 2 751; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 752; GFX6-NEXT: v_add_f32_e32 v7, v8, v5 753; GFX6-NEXT: v_mov_b32_e32 v6, v7 754; GFX6-NEXT: s_mov_b64 s[12:13], exec 755; GFX6-NEXT: v_mov_b32_e32 v7, v8 756; GFX6-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 757; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 758; GFX6-NEXT: v_readfirstlane_b32 s8, v0 759; GFX6-NEXT: v_readfirstlane_b32 s9, v1 
760; GFX6-NEXT: v_readfirstlane_b32 s10, v2 761; GFX6-NEXT: v_readfirstlane_b32 s11, v3 762; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 763; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 764; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 765; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 766; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 767; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 768; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 769; GFX6-NEXT: s_cbranch_execnz .LBB2_4 770; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 771; GFX6-NEXT: s_mov_b64 exec, s[12:13] 772; GFX6-NEXT: s_waitcnt vmcnt(0) 773; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 774; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 775; GFX6-NEXT: v_mov_b32_e32 v8, v6 776; GFX6-NEXT: buffer_wbinvl1 777; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 778; GFX6-NEXT: s_cbranch_execnz .LBB2_3 779; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 780; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 781; GFX6-NEXT: v_mov_b32_e32 v0, v6 782; GFX6-NEXT: s_waitcnt expcnt(0) 783; GFX6-NEXT: s_setpc_b64 s[30:31] 784 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 785 %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 786 ret float %result 787} 788 789define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 790; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 791; GFX12: ; %bb.0: 792; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 793; GFX12-NEXT: s_wait_expcnt 0x0 794; GFX12-NEXT: s_wait_samplecnt 0x0 795; GFX12-NEXT: s_wait_bvhcnt 0x0 796; GFX12-NEXT: s_wait_kmcnt 0x0 797; GFX12-NEXT: v_mov_b32_e32 v1, s16 798; GFX12-NEXT: s_wait_storecnt 0x0 799; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 800; GFX12-NEXT: s_wait_loadcnt 0x0 801; GFX12-NEXT: 
global_inv scope:SCOPE_DEV 802; GFX12-NEXT: s_setpc_b64 s[30:31] 803; 804; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 805; GFX940: ; %bb.0: 806; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 807; GFX940-NEXT: v_mov_b32_e32 v1, s16 808; GFX940-NEXT: buffer_wbl2 sc1 809; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 810; GFX940-NEXT: s_waitcnt vmcnt(0) 811; GFX940-NEXT: buffer_inv sc1 812; GFX940-NEXT: s_setpc_b64 s[30:31] 813; 814; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 815; GFX11: ; %bb.0: 816; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 817; GFX11-NEXT: v_mov_b32_e32 v1, s16 818; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 819; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc 820; GFX11-NEXT: s_waitcnt vmcnt(0) 821; GFX11-NEXT: buffer_gl1_inv 822; GFX11-NEXT: buffer_gl0_inv 823; GFX11-NEXT: s_setpc_b64 s[30:31] 824; 825; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 826; GFX10: ; %bb.0: 827; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 828; GFX10-NEXT: v_mov_b32_e32 v2, v0 829; GFX10-NEXT: v_mov_b32_e32 v0, s20 830; GFX10-NEXT: s_add_i32 s4, s20, 0x400 831; GFX10-NEXT: v_mov_b32_e32 v3, s4 832; GFX10-NEXT: s_mov_b32 s4, 0 833; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 834; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start 835; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 836; GFX10-NEXT: s_waitcnt vmcnt(0) 837; GFX10-NEXT: v_mov_b32_e32 v5, v0 838; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 839; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 840; GFX10-NEXT: v_mov_b32_e32 v0, v4 841; GFX10-NEXT: v_mov_b32_e32 v1, v5 842; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 843; GFX10-NEXT: s_waitcnt vmcnt(0) 844; GFX10-NEXT: buffer_gl1_inv 845; GFX10-NEXT: buffer_gl0_inv 846; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 847; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 848; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 849; GFX10-NEXT: s_cbranch_execnz .LBB3_1 850; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 851; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 852; GFX10-NEXT: s_setpc_b64 s[30:31] 853; 854; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 855; GFX90A: ; %bb.0: 856; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 857; GFX90A-NEXT: v_mov_b32_e32 v2, v0 858; GFX90A-NEXT: v_mov_b32_e32 v0, s20 859; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 860; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 861; GFX90A-NEXT: s_mov_b64 s[4:5], 0 862; GFX90A-NEXT: v_mov_b32_e32 v3, s6 863; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start 864; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 865; GFX90A-NEXT: s_waitcnt vmcnt(0) 866; GFX90A-NEXT: v_mov_b32_e32 v5, v0 867; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 868; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 869; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 870; GFX90A-NEXT: s_waitcnt vmcnt(0) 871; GFX90A-NEXT: buffer_wbinvl1 872; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 873; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 874; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 875; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 876; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 877; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 878; GFX90A-NEXT: s_setpc_b64 s[30:31] 879; 880; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 881; GFX908: ; %bb.0: 882; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 883; GFX908-NEXT: v_mov_b32_e32 v2, v0 884; GFX908-NEXT: v_mov_b32_e32 v0, s20 885; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 886; GFX908-NEXT: s_add_i32 s6, s20, 0x400 887; GFX908-NEXT: s_mov_b64 s[4:5], 0 888; GFX908-NEXT: v_mov_b32_e32 v3, s6 889; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start 890; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 891; GFX908-NEXT: s_waitcnt vmcnt(0) 892; GFX908-NEXT: v_mov_b32_e32 v5, v0 893; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 894; GFX908-NEXT: v_mov_b32_e32 v0, v4 895; GFX908-NEXT: v_mov_b32_e32 v1, v5 896; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 897; GFX908-NEXT: s_waitcnt vmcnt(0) 898; GFX908-NEXT: buffer_wbinvl1 899; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 900; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 901; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 902; GFX908-NEXT: s_cbranch_execnz .LBB3_1 903; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 904; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 905; GFX908-NEXT: s_setpc_b64 s[30:31] 906; 907; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 908; GFX8: ; %bb.0: 909; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 910; GFX8-NEXT: v_mov_b32_e32 v2, v0 911; GFX8-NEXT: v_mov_b32_e32 v0, s20 912; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 913; GFX8-NEXT: s_add_i32 s6, s20, 0x400 914; GFX8-NEXT: s_mov_b64 s[4:5], 0 915; GFX8-NEXT: v_mov_b32_e32 v3, s6 916; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start 917; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 918; GFX8-NEXT: s_waitcnt vmcnt(0) 919; GFX8-NEXT: v_mov_b32_e32 v5, v0 920; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 921; GFX8-NEXT: v_mov_b32_e32 v0, v4 922; GFX8-NEXT: v_mov_b32_e32 v1, v5 923; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 924; GFX8-NEXT: s_waitcnt vmcnt(0) 925; GFX8-NEXT: buffer_wbinvl1 926; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 927; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 928; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 929; GFX8-NEXT: s_cbranch_execnz .LBB3_1 930; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 931; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 932; GFX8-NEXT: s_setpc_b64 s[30:31] 933; 934; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 935; GFX7: ; %bb.0: 936; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 937; GFX7-NEXT: v_mov_b32_e32 v2, v0 938; GFX7-NEXT: v_mov_b32_e32 v0, s20 939; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 940; GFX7-NEXT: s_add_i32 s6, s20, 0x400 941; GFX7-NEXT: s_mov_b64 s[4:5], 0 942; GFX7-NEXT: v_mov_b32_e32 v3, s6 943; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start 944; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 945; GFX7-NEXT: s_waitcnt vmcnt(0) 946; GFX7-NEXT: v_mov_b32_e32 v5, v0 947; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 948; GFX7-NEXT: v_mov_b32_e32 v0, v4 949; GFX7-NEXT: v_mov_b32_e32 v1, v5 950; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 951; GFX7-NEXT: s_waitcnt vmcnt(0) 952; GFX7-NEXT: buffer_wbinvl1 953; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 954; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 955; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 956; GFX7-NEXT: s_cbranch_execnz .LBB3_1 957; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 958; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 959; GFX7-NEXT: s_setpc_b64 s[30:31] 960; 961; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: 962; GFX6: ; %bb.0: 963; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 964; GFX6-NEXT: v_mov_b32_e32 v2, v0 965; GFX6-NEXT: v_mov_b32_e32 v0, s20 966; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 967; GFX6-NEXT: s_add_i32 s6, s20, 0x400 968; GFX6-NEXT: s_mov_b64 s[4:5], 0 969; GFX6-NEXT: v_mov_b32_e32 v3, s6 970; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start 971; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 972; GFX6-NEXT: s_waitcnt vmcnt(0) 973; GFX6-NEXT: v_mov_b32_e32 v5, v0 974; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 975; GFX6-NEXT: s_waitcnt expcnt(0) 976; GFX6-NEXT: v_mov_b32_e32 v0, v4 977; GFX6-NEXT: v_mov_b32_e32 v1, v5 978; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 979; GFX6-NEXT: s_waitcnt vmcnt(0) 980; GFX6-NEXT: buffer_wbinvl1 981; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 982; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 983; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 984; GFX6-NEXT: s_cbranch_execnz .LBB3_1 985; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 986; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 987; GFX6-NEXT: s_waitcnt expcnt(0) 988; GFX6-NEXT: s_setpc_b64 s[30:31] 989 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 990 %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 991 ret float %result 992} 993 994define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 995; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 996; GFX12: ; %bb.0: 997; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 998; GFX12-NEXT: s_wait_expcnt 0x0 999; GFX12-NEXT: s_wait_samplecnt 0x0 1000; GFX12-NEXT: s_wait_bvhcnt 0x0 1001; GFX12-NEXT: s_wait_kmcnt 0x0 1002; GFX12-NEXT: v_mov_b32_e32 v1, s16 1003; GFX12-NEXT: s_wait_storecnt 0x0 1004; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 1005; GFX12-NEXT: s_wait_storecnt 0x0 1006; GFX12-NEXT: global_inv scope:SCOPE_DEV 1007; GFX12-NEXT: s_setpc_b64 s[30:31] 1008; 1009; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1010; GFX940: ; %bb.0: 1011; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1012; GFX940-NEXT: v_mov_b32_e32 v1, s16 1013; GFX940-NEXT: buffer_wbl2 sc1 1014; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 1015; GFX940-NEXT: s_waitcnt vmcnt(0) 1016; GFX940-NEXT: buffer_inv sc1 1017; GFX940-NEXT: s_setpc_b64 s[30:31] 1018; 1019; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1020; GFX11: ; %bb.0: 1021; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1022; GFX11-NEXT: v_mov_b32_e32 v1, s16 1023; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1024; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 
1025; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1026; GFX11-NEXT: buffer_gl1_inv 1027; GFX11-NEXT: buffer_gl0_inv 1028; GFX11-NEXT: s_setpc_b64 s[30:31] 1029; 1030; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1031; GFX10: ; %bb.0: 1032; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1033; GFX10-NEXT: v_mov_b32_e32 v1, s20 1034; GFX10-NEXT: s_add_i32 s4, s20, 0x400 1035; GFX10-NEXT: v_mov_b32_e32 v3, s4 1036; GFX10-NEXT: s_mov_b32 s4, 0 1037; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 1038; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start 1039; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1040; GFX10-NEXT: s_waitcnt vmcnt(0) 1041; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 1042; GFX10-NEXT: v_mov_b32_e32 v5, v2 1043; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1044; GFX10-NEXT: v_mov_b32_e32 v4, v1 1045; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 1046; GFX10-NEXT: s_waitcnt vmcnt(0) 1047; GFX10-NEXT: buffer_gl1_inv 1048; GFX10-NEXT: buffer_gl0_inv 1049; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 1050; GFX10-NEXT: v_mov_b32_e32 v2, v4 1051; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1052; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1053; GFX10-NEXT: s_cbranch_execnz .LBB4_1 1054; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1055; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1056; GFX10-NEXT: s_setpc_b64 s[30:31] 1057; 1058; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1059; GFX90A: ; %bb.0: 1060; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1061; GFX90A-NEXT: v_mov_b32_e32 v1, s20 1062; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 1063; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 1064; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1065; GFX90A-NEXT: v_mov_b32_e32 v1, s6 1066; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start 1067; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1068; GFX90A-NEXT: s_waitcnt vmcnt(0) 1069; GFX90A-NEXT: 
v_add_f32_e32 v2, v3, v0 1070; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] 1071; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc 1072; GFX90A-NEXT: s_waitcnt vmcnt(0) 1073; GFX90A-NEXT: buffer_wbinvl1 1074; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 1075; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1076; GFX90A-NEXT: v_mov_b32_e32 v3, v4 1077; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1078; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 1079; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1080; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1081; GFX90A-NEXT: s_setpc_b64 s[30:31] 1082; 1083; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1084; GFX908: ; %bb.0: 1085; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1086; GFX908-NEXT: v_mov_b32_e32 v1, s20 1087; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 1088; GFX908-NEXT: s_add_i32 s6, s20, 0x400 1089; GFX908-NEXT: s_mov_b64 s[4:5], 0 1090; GFX908-NEXT: v_mov_b32_e32 v3, s6 1091; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start 1092; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1093; GFX908-NEXT: s_waitcnt vmcnt(0) 1094; GFX908-NEXT: v_add_f32_e32 v1, v2, v0 1095; GFX908-NEXT: v_mov_b32_e32 v5, v2 1096; GFX908-NEXT: v_mov_b32_e32 v4, v1 1097; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 1098; GFX908-NEXT: s_waitcnt vmcnt(0) 1099; GFX908-NEXT: buffer_wbinvl1 1100; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 1101; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1102; GFX908-NEXT: v_mov_b32_e32 v2, v4 1103; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1104; GFX908-NEXT: s_cbranch_execnz .LBB4_1 1105; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1106; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1107; GFX908-NEXT: s_setpc_b64 s[30:31] 1108; 1109; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1110; GFX8: ; %bb.0: 1111; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1112; 
GFX8-NEXT: v_mov_b32_e32 v1, s20 1113; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 1114; GFX8-NEXT: s_add_i32 s6, s20, 0x400 1115; GFX8-NEXT: s_mov_b64 s[4:5], 0 1116; GFX8-NEXT: v_mov_b32_e32 v3, s6 1117; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start 1118; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1119; GFX8-NEXT: s_waitcnt vmcnt(0) 1120; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 1121; GFX8-NEXT: v_mov_b32_e32 v5, v2 1122; GFX8-NEXT: v_mov_b32_e32 v4, v1 1123; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 1124; GFX8-NEXT: s_waitcnt vmcnt(0) 1125; GFX8-NEXT: buffer_wbinvl1 1126; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 1127; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1128; GFX8-NEXT: v_mov_b32_e32 v2, v4 1129; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1130; GFX8-NEXT: s_cbranch_execnz .LBB4_1 1131; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1132; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1133; GFX8-NEXT: s_setpc_b64 s[30:31] 1134; 1135; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1136; GFX7: ; %bb.0: 1137; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1138; GFX7-NEXT: v_mov_b32_e32 v1, s20 1139; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 1140; GFX7-NEXT: s_add_i32 s6, s20, 0x400 1141; GFX7-NEXT: s_mov_b64 s[4:5], 0 1142; GFX7-NEXT: v_mov_b32_e32 v3, s6 1143; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start 1144; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1145; GFX7-NEXT: s_waitcnt vmcnt(0) 1146; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 1147; GFX7-NEXT: v_mov_b32_e32 v5, v2 1148; GFX7-NEXT: v_mov_b32_e32 v4, v1 1149; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 1150; GFX7-NEXT: s_waitcnt vmcnt(0) 1151; GFX7-NEXT: buffer_wbinvl1 1152; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 1153; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1154; GFX7-NEXT: v_mov_b32_e32 v2, v4 1155; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1156; GFX7-NEXT: s_cbranch_execnz .LBB4_1 1157; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1158; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1159; GFX7-NEXT: s_setpc_b64 s[30:31] 1160; 1161; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: 1162; GFX6: ; %bb.0: 1163; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1164; GFX6-NEXT: v_mov_b32_e32 v1, s20 1165; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 1166; GFX6-NEXT: s_add_i32 s6, s20, 0x400 1167; GFX6-NEXT: s_mov_b64 s[4:5], 0 1168; GFX6-NEXT: v_mov_b32_e32 v3, s6 1169; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start 1170; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 1171; GFX6-NEXT: s_waitcnt vmcnt(0) 1172; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 1173; GFX6-NEXT: s_waitcnt expcnt(0) 1174; GFX6-NEXT: v_mov_b32_e32 v5, v2 1175; GFX6-NEXT: v_mov_b32_e32 v4, v1 1176; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 1177; GFX6-NEXT: s_waitcnt vmcnt(0) 1178; GFX6-NEXT: buffer_wbinvl1 1179; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 1180; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1181; GFX6-NEXT: v_mov_b32_e32 v2, v4 1182; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 1183; GFX6-NEXT: s_cbranch_execnz .LBB4_1 1184; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 1185; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 1186; GFX6-NEXT: s_waitcnt expcnt(0) 1187; GFX6-NEXT: s_setpc_b64 s[30:31] 1188 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 1189 %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1190 ret void 1191} 1192 1193define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) { 1194; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1195; GFX12: ; %bb.0: 1196; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1197; GFX12-NEXT: s_wait_expcnt 0x0 1198; GFX12-NEXT: s_wait_samplecnt 0x0 1199; GFX12-NEXT: s_wait_bvhcnt 0x0 1200; GFX12-NEXT: s_wait_kmcnt 0x0 1201; GFX12-NEXT: v_mov_b32_e32 v1, s16 1202; 
GFX12-NEXT: s_wait_storecnt 0x0 1203; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 1204; GFX12-NEXT: s_wait_loadcnt 0x0 1205; GFX12-NEXT: global_inv scope:SCOPE_DEV 1206; GFX12-NEXT: s_setpc_b64 s[30:31] 1207; 1208; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1209; GFX940: ; %bb.0: 1210; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1211; GFX940-NEXT: v_mov_b32_e32 v1, s16 1212; GFX940-NEXT: buffer_wbl2 sc1 1213; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 1214; GFX940-NEXT: s_waitcnt vmcnt(0) 1215; GFX940-NEXT: buffer_inv sc1 1216; GFX940-NEXT: s_setpc_b64 s[30:31] 1217; 1218; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1219; GFX11: ; %bb.0: 1220; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1221; GFX11-NEXT: s_add_i32 s4, s16, 0x400 1222; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1223; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 1224; GFX11-NEXT: v_mov_b32_e32 v0, s16 1225; GFX11-NEXT: s_mov_b32 s4, 0 1226; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 1227; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start 1228; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1229; GFX11-NEXT: s_waitcnt vmcnt(0) 1230; GFX11-NEXT: v_mov_b32_e32 v5, v0 1231; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1232; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1233; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 1234; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 1235; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 1236; GFX11-NEXT: s_waitcnt vmcnt(0) 1237; GFX11-NEXT: buffer_gl1_inv 1238; GFX11-NEXT: buffer_gl0_inv 1239; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 1240; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1241; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1242; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1243; GFX11-NEXT: s_cbranch_execnz .LBB5_1 1244; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end 1245; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1246; GFX11-NEXT: s_setpc_b64 s[30:31] 1247; 1248; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1249; GFX10: ; %bb.0: 1250; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1251; GFX10-NEXT: v_mov_b32_e32 v2, v0 1252; GFX10-NEXT: v_mov_b32_e32 v0, s20 1253; GFX10-NEXT: s_add_i32 s4, s20, 0x400 1254; GFX10-NEXT: v_mov_b32_e32 v3, s4 1255; GFX10-NEXT: s_mov_b32 s4, 0 1256; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1257; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start 1258; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1259; GFX10-NEXT: s_waitcnt vmcnt(0) 1260; GFX10-NEXT: v_mov_b32_e32 v5, v0 1261; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1262; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 1263; GFX10-NEXT: v_mov_b32_e32 v0, v4 1264; GFX10-NEXT: v_mov_b32_e32 v1, v5 1265; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1266; GFX10-NEXT: s_waitcnt vmcnt(0) 1267; GFX10-NEXT: buffer_gl1_inv 1268; GFX10-NEXT: buffer_gl0_inv 1269; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 1270; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1271; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1272; GFX10-NEXT: s_cbranch_execnz .LBB5_1 1273; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1274; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1275; GFX10-NEXT: s_setpc_b64 s[30:31] 1276; 1277; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1278; GFX90A: ; %bb.0: 1279; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1280; GFX90A-NEXT: v_mov_b32_e32 v2, v0 1281; GFX90A-NEXT: v_mov_b32_e32 v0, s20 1282; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1283; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 1284; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1285; GFX90A-NEXT: v_mov_b32_e32 v3, s6 1286; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start 1287; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1288; GFX90A-NEXT: s_waitcnt vmcnt(0) 1289; GFX90A-NEXT: v_mov_b32_e32 v5, v0 1290; 
GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 1291; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 1292; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1293; GFX90A-NEXT: s_waitcnt vmcnt(0) 1294; GFX90A-NEXT: buffer_wbinvl1 1295; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1296; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1297; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1298; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 1299; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1300; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1301; GFX90A-NEXT: s_setpc_b64 s[30:31] 1302; 1303; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1304; GFX908: ; %bb.0: 1305; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1306; GFX908-NEXT: v_mov_b32_e32 v2, v0 1307; GFX908-NEXT: v_mov_b32_e32 v0, s20 1308; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1309; GFX908-NEXT: s_add_i32 s6, s20, 0x400 1310; GFX908-NEXT: s_mov_b64 s[4:5], 0 1311; GFX908-NEXT: v_mov_b32_e32 v3, s6 1312; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start 1313; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1314; GFX908-NEXT: s_waitcnt vmcnt(0) 1315; GFX908-NEXT: v_mov_b32_e32 v5, v0 1316; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 1317; GFX908-NEXT: v_mov_b32_e32 v0, v4 1318; GFX908-NEXT: v_mov_b32_e32 v1, v5 1319; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1320; GFX908-NEXT: s_waitcnt vmcnt(0) 1321; GFX908-NEXT: buffer_wbinvl1 1322; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1323; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1324; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1325; GFX908-NEXT: s_cbranch_execnz .LBB5_1 1326; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1327; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1328; GFX908-NEXT: s_setpc_b64 s[30:31] 1329; 1330; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1331; GFX8: ; %bb.0: 1332; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1333; GFX8-NEXT: v_mov_b32_e32 v2, v0 1334; GFX8-NEXT: 
v_mov_b32_e32 v0, s20 1335; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1336; GFX8-NEXT: s_add_i32 s6, s20, 0x400 1337; GFX8-NEXT: s_mov_b64 s[4:5], 0 1338; GFX8-NEXT: v_mov_b32_e32 v3, s6 1339; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start 1340; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1341; GFX8-NEXT: s_waitcnt vmcnt(0) 1342; GFX8-NEXT: v_mov_b32_e32 v5, v0 1343; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 1344; GFX8-NEXT: v_mov_b32_e32 v0, v4 1345; GFX8-NEXT: v_mov_b32_e32 v1, v5 1346; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1347; GFX8-NEXT: s_waitcnt vmcnt(0) 1348; GFX8-NEXT: buffer_wbinvl1 1349; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1350; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1351; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1352; GFX8-NEXT: s_cbranch_execnz .LBB5_1 1353; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1354; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1355; GFX8-NEXT: s_setpc_b64 s[30:31] 1356; 1357; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1358; GFX7: ; %bb.0: 1359; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1360; GFX7-NEXT: v_mov_b32_e32 v2, v0 1361; GFX7-NEXT: v_mov_b32_e32 v0, s20 1362; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1363; GFX7-NEXT: s_add_i32 s6, s20, 0x400 1364; GFX7-NEXT: s_mov_b64 s[4:5], 0 1365; GFX7-NEXT: v_mov_b32_e32 v3, s6 1366; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start 1367; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1368; GFX7-NEXT: s_waitcnt vmcnt(0) 1369; GFX7-NEXT: v_mov_b32_e32 v5, v0 1370; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 1371; GFX7-NEXT: v_mov_b32_e32 v0, v4 1372; GFX7-NEXT: v_mov_b32_e32 v1, v5 1373; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1374; GFX7-NEXT: s_waitcnt vmcnt(0) 1375; GFX7-NEXT: buffer_wbinvl1 1376; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1377; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1378; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1379; GFX7-NEXT: s_cbranch_execnz .LBB5_1 1380; GFX7-NEXT: 
; %bb.2: ; %atomicrmw.end 1381; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1382; GFX7-NEXT: s_setpc_b64 s[30:31] 1383; 1384; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: 1385; GFX6: ; %bb.0: 1386; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1387; GFX6-NEXT: v_mov_b32_e32 v2, v0 1388; GFX6-NEXT: v_mov_b32_e32 v0, s20 1389; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1390; GFX6-NEXT: s_add_i32 s6, s20, 0x400 1391; GFX6-NEXT: s_mov_b64 s[4:5], 0 1392; GFX6-NEXT: v_mov_b32_e32 v3, s6 1393; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start 1394; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 1395; GFX6-NEXT: s_waitcnt vmcnt(0) 1396; GFX6-NEXT: v_mov_b32_e32 v5, v0 1397; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 1398; GFX6-NEXT: s_waitcnt expcnt(0) 1399; GFX6-NEXT: v_mov_b32_e32 v0, v4 1400; GFX6-NEXT: v_mov_b32_e32 v1, v5 1401; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1402; GFX6-NEXT: s_waitcnt vmcnt(0) 1403; GFX6-NEXT: buffer_wbinvl1 1404; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1405; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1406; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 1407; GFX6-NEXT: s_cbranch_execnz .LBB5_1 1408; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 1409; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 1410; GFX6-NEXT: s_waitcnt expcnt(0) 1411; GFX6-NEXT: s_setpc_b64 s[30:31] 1412 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 1413 %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst 1414 ret float %result 1415} 1416 1417define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 1418; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1419; GFX12: ; %bb.0: 1420; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1421; GFX12-NEXT: s_wait_expcnt 0x0 1422; GFX12-NEXT: s_wait_samplecnt 0x0 1423; GFX12-NEXT: s_wait_bvhcnt 0x0 1424; GFX12-NEXT: s_wait_kmcnt 0x0 1425; GFX12-NEXT: 
v_mov_b32_e32 v1, s16 1426; GFX12-NEXT: s_wait_storecnt 0x0 1427; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 1428; GFX12-NEXT: s_wait_loadcnt 0x0 1429; GFX12-NEXT: global_inv scope:SCOPE_DEV 1430; GFX12-NEXT: s_setpc_b64 s[30:31] 1431; 1432; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1433; GFX940: ; %bb.0: 1434; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1435; GFX940-NEXT: v_mov_b32_e32 v1, s16 1436; GFX940-NEXT: buffer_wbl2 sc1 1437; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 1438; GFX940-NEXT: s_waitcnt vmcnt(0) 1439; GFX940-NEXT: buffer_inv sc1 1440; GFX940-NEXT: s_setpc_b64 s[30:31] 1441; 1442; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1443; GFX11: ; %bb.0: 1444; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1445; GFX11-NEXT: s_add_i32 s4, s16, 0x400 1446; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1447; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 1448; GFX11-NEXT: v_mov_b32_e32 v0, s16 1449; GFX11-NEXT: s_mov_b32 s4, 0 1450; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 1451; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start 1452; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1453; GFX11-NEXT: s_waitcnt vmcnt(0) 1454; GFX11-NEXT: v_mov_b32_e32 v5, v0 1455; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1456; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1457; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 1458; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 1459; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 1460; GFX11-NEXT: s_waitcnt vmcnt(0) 1461; GFX11-NEXT: buffer_gl1_inv 1462; GFX11-NEXT: buffer_gl0_inv 1463; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 1464; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1465; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1466; GFX11-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 1467; GFX11-NEXT: s_cbranch_execnz .LBB6_1 1468; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1469; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1470; GFX11-NEXT: s_setpc_b64 s[30:31] 1471; 1472; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1473; GFX10: ; %bb.0: 1474; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1475; GFX10-NEXT: v_mov_b32_e32 v2, v0 1476; GFX10-NEXT: v_mov_b32_e32 v0, s20 1477; GFX10-NEXT: s_add_i32 s4, s20, 0x400 1478; GFX10-NEXT: v_mov_b32_e32 v3, s4 1479; GFX10-NEXT: s_mov_b32 s4, 0 1480; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1481; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start 1482; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1483; GFX10-NEXT: s_waitcnt vmcnt(0) 1484; GFX10-NEXT: v_mov_b32_e32 v5, v0 1485; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1486; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 1487; GFX10-NEXT: v_mov_b32_e32 v0, v4 1488; GFX10-NEXT: v_mov_b32_e32 v1, v5 1489; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1490; GFX10-NEXT: s_waitcnt vmcnt(0) 1491; GFX10-NEXT: buffer_gl1_inv 1492; GFX10-NEXT: buffer_gl0_inv 1493; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 1494; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1495; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1496; GFX10-NEXT: s_cbranch_execnz .LBB6_1 1497; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1498; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1499; GFX10-NEXT: s_setpc_b64 s[30:31] 1500; 1501; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1502; GFX90A: ; %bb.0: 1503; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1504; GFX90A-NEXT: v_mov_b32_e32 v2, v0 1505; GFX90A-NEXT: v_mov_b32_e32 v0, s20 1506; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1507; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 1508; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1509; GFX90A-NEXT: v_mov_b32_e32 v3, s6 1510; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start 1511; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 1512; GFX90A-NEXT: s_waitcnt vmcnt(0) 1513; GFX90A-NEXT: v_mov_b32_e32 v5, v0 1514; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 1515; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 1516; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1517; GFX90A-NEXT: s_waitcnt vmcnt(0) 1518; GFX90A-NEXT: buffer_wbinvl1 1519; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1520; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1521; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1522; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 1523; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1524; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1525; GFX90A-NEXT: s_setpc_b64 s[30:31] 1526; 1527; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1528; GFX908: ; %bb.0: 1529; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1530; GFX908-NEXT: v_mov_b32_e32 v2, v0 1531; GFX908-NEXT: v_mov_b32_e32 v0, s20 1532; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1533; GFX908-NEXT: s_add_i32 s6, s20, 0x400 1534; GFX908-NEXT: s_mov_b64 s[4:5], 0 1535; GFX908-NEXT: v_mov_b32_e32 v3, s6 1536; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start 1537; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1538; GFX908-NEXT: s_waitcnt vmcnt(0) 1539; GFX908-NEXT: v_mov_b32_e32 v5, v0 1540; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 1541; GFX908-NEXT: v_mov_b32_e32 v0, v4 1542; GFX908-NEXT: v_mov_b32_e32 v1, v5 1543; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1544; GFX908-NEXT: s_waitcnt vmcnt(0) 1545; GFX908-NEXT: buffer_wbinvl1 1546; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1547; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1548; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1549; GFX908-NEXT: s_cbranch_execnz .LBB6_1 1550; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1551; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1552; GFX908-NEXT: s_setpc_b64 s[30:31] 1553; 1554; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1555; GFX8: ; %bb.0: 1556; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1557; GFX8-NEXT: v_mov_b32_e32 v2, v0 1558; GFX8-NEXT: v_mov_b32_e32 v0, s20 1559; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1560; GFX8-NEXT: s_add_i32 s6, s20, 0x400 1561; GFX8-NEXT: s_mov_b64 s[4:5], 0 1562; GFX8-NEXT: v_mov_b32_e32 v3, s6 1563; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start 1564; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1565; GFX8-NEXT: s_waitcnt vmcnt(0) 1566; GFX8-NEXT: v_mov_b32_e32 v5, v0 1567; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 1568; GFX8-NEXT: v_mov_b32_e32 v0, v4 1569; GFX8-NEXT: v_mov_b32_e32 v1, v5 1570; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1571; GFX8-NEXT: s_waitcnt vmcnt(0) 1572; GFX8-NEXT: buffer_wbinvl1 1573; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1574; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1575; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1576; GFX8-NEXT: s_cbranch_execnz .LBB6_1 1577; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1578; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1579; GFX8-NEXT: s_setpc_b64 s[30:31] 1580; 1581; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1582; GFX7: ; %bb.0: 1583; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1584; GFX7-NEXT: v_mov_b32_e32 v2, v0 1585; GFX7-NEXT: v_mov_b32_e32 v0, s20 1586; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1587; GFX7-NEXT: s_add_i32 s6, s20, 0x400 1588; GFX7-NEXT: s_mov_b64 s[4:5], 0 1589; GFX7-NEXT: v_mov_b32_e32 v3, s6 1590; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start 1591; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1592; GFX7-NEXT: s_waitcnt vmcnt(0) 1593; GFX7-NEXT: v_mov_b32_e32 v5, v0 1594; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 1595; GFX7-NEXT: v_mov_b32_e32 v0, v4 1596; GFX7-NEXT: v_mov_b32_e32 v1, v5 1597; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1598; GFX7-NEXT: s_waitcnt 
vmcnt(0) 1599; GFX7-NEXT: buffer_wbinvl1 1600; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1601; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1602; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1603; GFX7-NEXT: s_cbranch_execnz .LBB6_1 1604; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1605; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1606; GFX7-NEXT: s_setpc_b64 s[30:31] 1607; 1608; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: 1609; GFX6: ; %bb.0: 1610; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1611; GFX6-NEXT: v_mov_b32_e32 v2, v0 1612; GFX6-NEXT: v_mov_b32_e32 v0, s20 1613; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1614; GFX6-NEXT: s_add_i32 s6, s20, 0x400 1615; GFX6-NEXT: s_mov_b64 s[4:5], 0 1616; GFX6-NEXT: v_mov_b32_e32 v3, s6 1617; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start 1618; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 1619; GFX6-NEXT: s_waitcnt vmcnt(0) 1620; GFX6-NEXT: v_mov_b32_e32 v5, v0 1621; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 1622; GFX6-NEXT: s_waitcnt expcnt(0) 1623; GFX6-NEXT: v_mov_b32_e32 v0, v4 1624; GFX6-NEXT: v_mov_b32_e32 v1, v5 1625; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1626; GFX6-NEXT: s_waitcnt vmcnt(0) 1627; GFX6-NEXT: buffer_wbinvl1 1628; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1629; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1630; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 1631; GFX6-NEXT: s_cbranch_execnz .LBB6_1 1632; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 1633; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 1634; GFX6-NEXT: s_waitcnt expcnt(0) 1635; GFX6-NEXT: s_setpc_b64 s[30:31] 1636 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 1637 %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 1638 ret float %result 1639} 1640 1641define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { 1642; 
GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1643; GFX12: ; %bb.0: 1644; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1645; GFX12-NEXT: s_wait_expcnt 0x0 1646; GFX12-NEXT: s_wait_samplecnt 0x0 1647; GFX12-NEXT: s_wait_bvhcnt 0x0 1648; GFX12-NEXT: s_wait_kmcnt 0x0 1649; GFX12-NEXT: v_mov_b32_e32 v1, s16 1650; GFX12-NEXT: s_wait_storecnt 0x0 1651; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 1652; GFX12-NEXT: s_wait_loadcnt 0x0 1653; GFX12-NEXT: global_inv scope:SCOPE_DEV 1654; GFX12-NEXT: s_setpc_b64 s[30:31] 1655; 1656; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1657; GFX940: ; %bb.0: 1658; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1659; GFX940-NEXT: v_mov_b32_e32 v1, s16 1660; GFX940-NEXT: buffer_wbl2 sc1 1661; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 1662; GFX940-NEXT: s_waitcnt vmcnt(0) 1663; GFX940-NEXT: buffer_inv sc1 1664; GFX940-NEXT: s_setpc_b64 s[30:31] 1665; 1666; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1667; GFX11: ; %bb.0: 1668; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1669; GFX11-NEXT: s_add_i32 s4, s16, 0x400 1670; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1671; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 1672; GFX11-NEXT: v_mov_b32_e32 v0, s16 1673; GFX11-NEXT: s_mov_b32 s4, 0 1674; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 1675; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start 1676; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1677; GFX11-NEXT: s_waitcnt vmcnt(0) 1678; GFX11-NEXT: v_mov_b32_e32 v5, v0 1679; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1680; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1681; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 1682; GFX11-NEXT: 
v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 1683; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 1684; GFX11-NEXT: s_waitcnt vmcnt(0) 1685; GFX11-NEXT: buffer_gl1_inv 1686; GFX11-NEXT: buffer_gl0_inv 1687; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 1688; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1689; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1690; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1691; GFX11-NEXT: s_cbranch_execnz .LBB7_1 1692; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1693; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1694; GFX11-NEXT: s_setpc_b64 s[30:31] 1695; 1696; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1697; GFX10: ; %bb.0: 1698; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1699; GFX10-NEXT: v_mov_b32_e32 v2, v0 1700; GFX10-NEXT: v_mov_b32_e32 v0, s20 1701; GFX10-NEXT: s_add_i32 s4, s20, 0x400 1702; GFX10-NEXT: v_mov_b32_e32 v3, s4 1703; GFX10-NEXT: s_mov_b32 s4, 0 1704; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1705; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start 1706; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1707; GFX10-NEXT: s_waitcnt vmcnt(0) 1708; GFX10-NEXT: v_mov_b32_e32 v5, v0 1709; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1710; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 1711; GFX10-NEXT: v_mov_b32_e32 v0, v4 1712; GFX10-NEXT: v_mov_b32_e32 v1, v5 1713; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1714; GFX10-NEXT: s_waitcnt vmcnt(0) 1715; GFX10-NEXT: buffer_gl1_inv 1716; GFX10-NEXT: buffer_gl0_inv 1717; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 1718; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1719; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1720; GFX10-NEXT: s_cbranch_execnz .LBB7_1 1721; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1722; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1723; GFX10-NEXT: s_setpc_b64 s[30:31] 1724; 1725; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1726; GFX90A: ; %bb.0: 1727; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1728; GFX90A-NEXT: v_mov_b32_e32 v2, v0 1729; GFX90A-NEXT: v_mov_b32_e32 v0, s20 1730; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1731; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 1732; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1733; GFX90A-NEXT: v_mov_b32_e32 v3, s6 1734; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start 1735; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1736; GFX90A-NEXT: s_waitcnt vmcnt(0) 1737; GFX90A-NEXT: v_mov_b32_e32 v5, v0 1738; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 1739; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 1740; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1741; GFX90A-NEXT: s_waitcnt vmcnt(0) 1742; GFX90A-NEXT: buffer_wbinvl1 1743; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1744; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1745; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1746; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 1747; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1748; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1749; GFX90A-NEXT: s_setpc_b64 s[30:31] 1750; 1751; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1752; GFX908: ; %bb.0: 1753; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1754; GFX908-NEXT: v_mov_b32_e32 v2, v0 1755; GFX908-NEXT: v_mov_b32_e32 v0, s20 1756; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1757; GFX908-NEXT: s_add_i32 s6, s20, 0x400 1758; GFX908-NEXT: s_mov_b64 s[4:5], 0 1759; GFX908-NEXT: v_mov_b32_e32 v3, s6 1760; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start 1761; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1762; GFX908-NEXT: s_waitcnt vmcnt(0) 1763; GFX908-NEXT: v_mov_b32_e32 v5, v0 1764; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 1765; GFX908-NEXT: v_mov_b32_e32 v0, v4 1766; GFX908-NEXT: 
v_mov_b32_e32 v1, v5 1767; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1768; GFX908-NEXT: s_waitcnt vmcnt(0) 1769; GFX908-NEXT: buffer_wbinvl1 1770; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1771; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1772; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1773; GFX908-NEXT: s_cbranch_execnz .LBB7_1 1774; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1775; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1776; GFX908-NEXT: s_setpc_b64 s[30:31] 1777; 1778; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1779; GFX8: ; %bb.0: 1780; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1781; GFX8-NEXT: v_mov_b32_e32 v2, v0 1782; GFX8-NEXT: v_mov_b32_e32 v0, s20 1783; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1784; GFX8-NEXT: s_add_i32 s6, s20, 0x400 1785; GFX8-NEXT: s_mov_b64 s[4:5], 0 1786; GFX8-NEXT: v_mov_b32_e32 v3, s6 1787; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start 1788; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1789; GFX8-NEXT: s_waitcnt vmcnt(0) 1790; GFX8-NEXT: v_mov_b32_e32 v5, v0 1791; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 1792; GFX8-NEXT: v_mov_b32_e32 v0, v4 1793; GFX8-NEXT: v_mov_b32_e32 v1, v5 1794; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1795; GFX8-NEXT: s_waitcnt vmcnt(0) 1796; GFX8-NEXT: buffer_wbinvl1 1797; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1798; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1799; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1800; GFX8-NEXT: s_cbranch_execnz .LBB7_1 1801; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1802; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1803; GFX8-NEXT: s_setpc_b64 s[30:31] 1804; 1805; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1806; GFX7: ; %bb.0: 1807; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1808; GFX7-NEXT: v_mov_b32_e32 v2, v0 1809; GFX7-NEXT: v_mov_b32_e32 v0, s20 1810; 
GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1811; GFX7-NEXT: s_add_i32 s6, s20, 0x400 1812; GFX7-NEXT: s_mov_b64 s[4:5], 0 1813; GFX7-NEXT: v_mov_b32_e32 v3, s6 1814; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start 1815; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1816; GFX7-NEXT: s_waitcnt vmcnt(0) 1817; GFX7-NEXT: v_mov_b32_e32 v5, v0 1818; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 1819; GFX7-NEXT: v_mov_b32_e32 v0, v4 1820; GFX7-NEXT: v_mov_b32_e32 v1, v5 1821; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1822; GFX7-NEXT: s_waitcnt vmcnt(0) 1823; GFX7-NEXT: buffer_wbinvl1 1824; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1825; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1826; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1827; GFX7-NEXT: s_cbranch_execnz .LBB7_1 1828; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1829; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1830; GFX7-NEXT: s_setpc_b64 s[30:31] 1831; 1832; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 1833; GFX6: ; %bb.0: 1834; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1835; GFX6-NEXT: v_mov_b32_e32 v2, v0 1836; GFX6-NEXT: v_mov_b32_e32 v0, s20 1837; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1838; GFX6-NEXT: s_add_i32 s6, s20, 0x400 1839; GFX6-NEXT: s_mov_b64 s[4:5], 0 1840; GFX6-NEXT: v_mov_b32_e32 v3, s6 1841; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start 1842; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 1843; GFX6-NEXT: s_waitcnt vmcnt(0) 1844; GFX6-NEXT: v_mov_b32_e32 v5, v0 1845; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 1846; GFX6-NEXT: s_waitcnt expcnt(0) 1847; GFX6-NEXT: v_mov_b32_e32 v0, v4 1848; GFX6-NEXT: v_mov_b32_e32 v1, v5 1849; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1850; GFX6-NEXT: s_waitcnt vmcnt(0) 1851; GFX6-NEXT: buffer_wbinvl1 1852; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1853; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1854; GFX6-NEXT: s_andn2_b64 exec, exec, 
s[4:5] 1855; GFX6-NEXT: s_cbranch_execnz .LBB7_1 1856; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 1857; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 1858; GFX6-NEXT: s_waitcnt expcnt(0) 1859; GFX6-NEXT: s_setpc_b64 s[30:31] 1860 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 1861 %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1862 ret float %result 1863} 1864 1865; -------------------------------------------------------------------- 1866; double 1867; -------------------------------------------------------------------- 1868 1869define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 1870; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 1871; GFX12: ; %bb.0: 1872; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1873; GFX12-NEXT: s_wait_expcnt 0x0 1874; GFX12-NEXT: s_wait_samplecnt 0x0 1875; GFX12-NEXT: s_wait_bvhcnt 0x0 1876; GFX12-NEXT: s_wait_kmcnt 0x0 1877; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 1878; GFX12-NEXT: v_mov_b32_e32 v0, s16 1879; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 1880; GFX12-NEXT: s_wait_alu 0xfffe 1881; GFX12-NEXT: v_mov_b32_e32 v6, s4 1882; GFX12-NEXT: s_mov_b32 s4, 0 1883; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 1884; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start 1885; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1886; GFX12-NEXT: s_wait_loadcnt 0x0 1887; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1888; GFX12-NEXT: s_wait_storecnt 0x0 1889; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1890; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] 1891; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 1892; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 1893; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, 
s[0:3], null offen th:TH_ATOMIC_RETURN 1894; GFX12-NEXT: s_wait_loadcnt 0x0 1895; GFX12-NEXT: global_inv scope:SCOPE_DEV 1896; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1897; GFX12-NEXT: s_wait_alu 0xfffe 1898; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 1899; GFX12-NEXT: s_wait_alu 0xfffe 1900; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1901; GFX12-NEXT: s_cbranch_execnz .LBB8_1 1902; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1903; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 1904; GFX12-NEXT: s_wait_alu 0xfffe 1905; GFX12-NEXT: s_setpc_b64 s[30:31] 1906; 1907; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 1908; GFX940: ; %bb.0: 1909; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1910; GFX940-NEXT: v_mov_b32_e32 v2, s16 1911; GFX940-NEXT: buffer_wbl2 sc1 1912; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 1913; GFX940-NEXT: s_waitcnt vmcnt(0) 1914; GFX940-NEXT: buffer_inv sc1 1915; GFX940-NEXT: s_setpc_b64 s[30:31] 1916; 1917; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 1918; GFX11: ; %bb.0: 1919; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1920; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 1921; GFX11-NEXT: v_mov_b32_e32 v0, s16 1922; GFX11-NEXT: s_add_i32 s4, s16, 0x800 1923; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1924; GFX11-NEXT: v_mov_b32_e32 v6, s4 1925; GFX11-NEXT: s_mov_b32 s4, 0 1926; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 1927; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start 1928; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1929; GFX11-NEXT: s_waitcnt vmcnt(0) 1930; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1931; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1932; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1933; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 1934; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 
v1, v8 1935; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 1936; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc 1937; GFX11-NEXT: s_waitcnt vmcnt(0) 1938; GFX11-NEXT: buffer_gl1_inv 1939; GFX11-NEXT: buffer_gl0_inv 1940; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1941; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1942; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1943; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1944; GFX11-NEXT: s_cbranch_execnz .LBB8_1 1945; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1946; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1947; GFX11-NEXT: s_setpc_b64 s[30:31] 1948; 1949; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 1950; GFX10: ; %bb.0: 1951; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1952; GFX10-NEXT: v_mov_b32_e32 v4, v0 1953; GFX10-NEXT: v_mov_b32_e32 v0, s20 1954; GFX10-NEXT: v_mov_b32_e32 v5, v1 1955; GFX10-NEXT: s_add_i32 s4, s20, 0x800 1956; GFX10-NEXT: v_mov_b32_e32 v6, s4 1957; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 1958; GFX10-NEXT: s_mov_b32 s4, 0 1959; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start 1960; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1961; GFX10-NEXT: s_waitcnt vmcnt(0) 1962; GFX10-NEXT: v_mov_b32_e32 v10, v1 1963; GFX10-NEXT: v_mov_b32_e32 v9, v0 1964; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1965; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 1966; GFX10-NEXT: v_mov_b32_e32 v0, v7 1967; GFX10-NEXT: v_mov_b32_e32 v1, v8 1968; GFX10-NEXT: v_mov_b32_e32 v2, v9 1969; GFX10-NEXT: v_mov_b32_e32 v3, v10 1970; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 1971; GFX10-NEXT: s_waitcnt vmcnt(0) 1972; GFX10-NEXT: buffer_gl1_inv 1973; GFX10-NEXT: buffer_gl0_inv 1974; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1975; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1976; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1977; GFX10-NEXT: s_cbranch_execnz .LBB8_1 1978; GFX10-NEXT: ; 
%bb.2: ; %atomicrmw.end 1979; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1980; GFX10-NEXT: s_setpc_b64 s[30:31] 1981; 1982; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 1983; GFX90A: ; %bb.0: 1984; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1985; GFX90A-NEXT: v_mov_b32_e32 v2, s20 1986; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 1987; GFX90A-NEXT: s_waitcnt vmcnt(0) 1988; GFX90A-NEXT: buffer_wbinvl1 1989; GFX90A-NEXT: s_setpc_b64 s[30:31] 1990; 1991; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 1992; GFX908: ; %bb.0: 1993; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1994; GFX908-NEXT: v_mov_b32_e32 v4, v0 1995; GFX908-NEXT: v_mov_b32_e32 v0, s20 1996; GFX908-NEXT: v_mov_b32_e32 v5, v1 1997; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 1998; GFX908-NEXT: s_add_i32 s6, s20, 0x800 1999; GFX908-NEXT: s_mov_b64 s[4:5], 0 2000; GFX908-NEXT: v_mov_b32_e32 v6, s6 2001; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start 2002; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2003; GFX908-NEXT: s_waitcnt vmcnt(0) 2004; GFX908-NEXT: v_mov_b32_e32 v10, v1 2005; GFX908-NEXT: v_mov_b32_e32 v9, v0 2006; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 2007; GFX908-NEXT: v_mov_b32_e32 v0, v7 2008; GFX908-NEXT: v_mov_b32_e32 v1, v8 2009; GFX908-NEXT: v_mov_b32_e32 v2, v9 2010; GFX908-NEXT: v_mov_b32_e32 v3, v10 2011; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2012; GFX908-NEXT: s_waitcnt vmcnt(0) 2013; GFX908-NEXT: buffer_wbinvl1 2014; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2015; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2016; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2017; GFX908-NEXT: s_cbranch_execnz .LBB8_1 2018; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2019; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2020; GFX908-NEXT: s_setpc_b64 s[30:31] 2021; 2022; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 2023; GFX8: ; %bb.0: 2024; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2025; GFX8-NEXT: v_mov_b32_e32 v4, v0 2026; GFX8-NEXT: v_mov_b32_e32 v0, s20 2027; GFX8-NEXT: v_mov_b32_e32 v5, v1 2028; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2029; GFX8-NEXT: s_add_i32 s6, s20, 0x800 2030; GFX8-NEXT: s_mov_b64 s[4:5], 0 2031; GFX8-NEXT: v_mov_b32_e32 v6, s6 2032; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start 2033; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2034; GFX8-NEXT: s_waitcnt vmcnt(0) 2035; GFX8-NEXT: v_mov_b32_e32 v10, v1 2036; GFX8-NEXT: v_mov_b32_e32 v9, v0 2037; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 2038; GFX8-NEXT: v_mov_b32_e32 v0, v7 2039; GFX8-NEXT: v_mov_b32_e32 v1, v8 2040; GFX8-NEXT: v_mov_b32_e32 v2, v9 2041; GFX8-NEXT: v_mov_b32_e32 v3, v10 2042; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2043; GFX8-NEXT: s_waitcnt vmcnt(0) 2044; GFX8-NEXT: buffer_wbinvl1 2045; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2046; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2047; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2048; GFX8-NEXT: s_cbranch_execnz .LBB8_1 2049; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2050; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2051; GFX8-NEXT: s_setpc_b64 s[30:31] 2052; 2053; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 2054; GFX7: ; %bb.0: 2055; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2056; GFX7-NEXT: v_mov_b32_e32 v4, v0 2057; GFX7-NEXT: v_mov_b32_e32 v0, s20 2058; GFX7-NEXT: v_mov_b32_e32 v5, v1 2059; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2060; GFX7-NEXT: s_add_i32 s6, s20, 0x800 2061; GFX7-NEXT: s_mov_b64 s[4:5], 0 2062; GFX7-NEXT: v_mov_b32_e32 v6, s6 2063; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start 2064; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2065; GFX7-NEXT: s_waitcnt vmcnt(0) 2066; GFX7-NEXT: 
v_mov_b32_e32 v10, v1 2067; GFX7-NEXT: v_mov_b32_e32 v9, v0 2068; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 2069; GFX7-NEXT: v_mov_b32_e32 v0, v7 2070; GFX7-NEXT: v_mov_b32_e32 v1, v8 2071; GFX7-NEXT: v_mov_b32_e32 v2, v9 2072; GFX7-NEXT: v_mov_b32_e32 v3, v10 2073; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2074; GFX7-NEXT: s_waitcnt vmcnt(0) 2075; GFX7-NEXT: buffer_wbinvl1 2076; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2077; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2078; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2079; GFX7-NEXT: s_cbranch_execnz .LBB8_1 2080; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2081; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2082; GFX7-NEXT: s_setpc_b64 s[30:31] 2083; 2084; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: 2085; GFX6: ; %bb.0: 2086; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2087; GFX6-NEXT: v_mov_b32_e32 v4, v0 2088; GFX6-NEXT: v_mov_b32_e32 v0, s20 2089; GFX6-NEXT: v_mov_b32_e32 v5, v1 2090; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2091; GFX6-NEXT: s_add_i32 s6, s20, 0x800 2092; GFX6-NEXT: s_mov_b64 s[4:5], 0 2093; GFX6-NEXT: v_mov_b32_e32 v6, s6 2094; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start 2095; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 2096; GFX6-NEXT: s_waitcnt vmcnt(0) 2097; GFX6-NEXT: v_mov_b32_e32 v10, v1 2098; GFX6-NEXT: v_mov_b32_e32 v9, v0 2099; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 2100; GFX6-NEXT: s_waitcnt expcnt(0) 2101; GFX6-NEXT: v_mov_b32_e32 v0, v7 2102; GFX6-NEXT: v_mov_b32_e32 v1, v8 2103; GFX6-NEXT: v_mov_b32_e32 v2, v9 2104; GFX6-NEXT: v_mov_b32_e32 v3, v10 2105; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2106; GFX6-NEXT: s_waitcnt vmcnt(0) 2107; GFX6-NEXT: buffer_wbinvl1 2108; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2109; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2110; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 2111; GFX6-NEXT: s_cbranch_execnz 
.LBB8_1 2112; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 2113; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 2114; GFX6-NEXT: s_waitcnt expcnt(0) 2115; GFX6-NEXT: s_setpc_b64 s[30:31] 2116 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 2117 %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2118 ret double %result 2119} 2120 2121define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 2122; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2123; GFX12: ; %bb.0: 2124; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2125; GFX12-NEXT: s_wait_expcnt 0x0 2126; GFX12-NEXT: s_wait_samplecnt 0x0 2127; GFX12-NEXT: s_wait_bvhcnt 0x0 2128; GFX12-NEXT: s_wait_kmcnt 0x0 2129; GFX12-NEXT: v_mov_b32_e32 v2, s16 2130; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 2131; GFX12-NEXT: s_wait_alu 0xfffe 2132; GFX12-NEXT: v_mov_b32_e32 v6, s4 2133; GFX12-NEXT: s_mov_b32 s4, 0 2134; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 2135; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start 2136; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2137; GFX12-NEXT: s_wait_loadcnt 0x0 2138; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] 2139; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 2140; GFX12-NEXT: s_wait_storecnt 0x0 2141; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 2142; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 2143; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 2144; GFX12-NEXT: s_wait_loadcnt 0x0 2145; GFX12-NEXT: global_inv scope:SCOPE_DEV 2146; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] 2147; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 2148; GFX12-NEXT: s_wait_alu 0xfffe 2149; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 2150; GFX12-NEXT: s_wait_alu 0xfffe 2151; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 2152; GFX12-NEXT: s_cbranch_execnz .LBB9_1 2153; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2154; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 2155; GFX12-NEXT: s_wait_alu 0xfffe 2156; GFX12-NEXT: s_setpc_b64 s[30:31] 2157; 2158; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2159; GFX940: ; %bb.0: 2160; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2161; GFX940-NEXT: v_mov_b32_e32 v2, s16 2162; GFX940-NEXT: buffer_wbl2 sc1 2163; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 2164; GFX940-NEXT: s_waitcnt vmcnt(0) 2165; GFX940-NEXT: buffer_inv sc1 2166; GFX940-NEXT: s_setpc_b64 s[30:31] 2167; 2168; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2169; GFX11: ; %bb.0: 2170; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2171; GFX11-NEXT: v_mov_b32_e32 v2, s16 2172; GFX11-NEXT: s_add_i32 s4, s16, 0x800 2173; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2174; GFX11-NEXT: v_mov_b32_e32 v6, s4 2175; GFX11-NEXT: s_mov_b32 s4, 0 2176; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 2177; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start 2178; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2179; GFX11-NEXT: s_waitcnt vmcnt(0) 2180; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] 2181; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 2182; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2183; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2184; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 2185; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc 2186; GFX11-NEXT: s_waitcnt vmcnt(0) 2187; GFX11-NEXT: buffer_gl1_inv 2188; GFX11-NEXT: buffer_gl0_inv 2189; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] 2190; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 2191; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 2192; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2193; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 2194; GFX11-NEXT: s_cbranch_execnz .LBB9_1 2195; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2196; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 2197; GFX11-NEXT: s_setpc_b64 s[30:31] 2198; 2199; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2200; GFX10: ; %bb.0: 2201; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2202; GFX10-NEXT: v_mov_b32_e32 v2, s20 2203; GFX10-NEXT: s_add_i32 s4, s20, 0x800 2204; GFX10-NEXT: v_mov_b32_e32 v6, s4 2205; GFX10-NEXT: s_mov_b32 s4, 0 2206; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 2207; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start 2208; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2209; GFX10-NEXT: s_waitcnt vmcnt(0) 2210; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] 2211; GFX10-NEXT: v_mov_b32_e32 v10, v5 2212; GFX10-NEXT: v_mov_b32_e32 v9, v4 2213; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2214; GFX10-NEXT: v_mov_b32_e32 v8, v3 2215; GFX10-NEXT: v_mov_b32_e32 v7, v2 2216; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2217; GFX10-NEXT: s_waitcnt vmcnt(0) 2218; GFX10-NEXT: buffer_gl1_inv 2219; GFX10-NEXT: buffer_gl0_inv 2220; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] 2221; GFX10-NEXT: v_mov_b32_e32 v4, v7 2222; GFX10-NEXT: v_mov_b32_e32 v5, v8 2223; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2224; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2225; GFX10-NEXT: s_cbranch_execnz .LBB9_1 2226; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2227; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2228; GFX10-NEXT: s_setpc_b64 s[30:31] 2229; 2230; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2231; GFX90A: ; %bb.0: 2232; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2233; GFX90A-NEXT: v_mov_b32_e32 v2, s20 2234; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 2235; GFX90A-NEXT: s_waitcnt vmcnt(0) 2236; GFX90A-NEXT: buffer_wbinvl1 2237; GFX90A-NEXT: 
s_setpc_b64 s[30:31] 2238; 2239; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2240; GFX908: ; %bb.0: 2241; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2242; GFX908-NEXT: v_mov_b32_e32 v2, s20 2243; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 2244; GFX908-NEXT: s_add_i32 s6, s20, 0x800 2245; GFX908-NEXT: s_mov_b64 s[4:5], 0 2246; GFX908-NEXT: v_mov_b32_e32 v6, s6 2247; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start 2248; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2249; GFX908-NEXT: s_waitcnt vmcnt(0) 2250; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] 2251; GFX908-NEXT: v_mov_b32_e32 v10, v5 2252; GFX908-NEXT: v_mov_b32_e32 v9, v4 2253; GFX908-NEXT: v_mov_b32_e32 v8, v3 2254; GFX908-NEXT: v_mov_b32_e32 v7, v2 2255; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2256; GFX908-NEXT: s_waitcnt vmcnt(0) 2257; GFX908-NEXT: buffer_wbinvl1 2258; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] 2259; GFX908-NEXT: v_mov_b32_e32 v4, v7 2260; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2261; GFX908-NEXT: v_mov_b32_e32 v5, v8 2262; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2263; GFX908-NEXT: s_cbranch_execnz .LBB9_1 2264; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2265; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2266; GFX908-NEXT: s_setpc_b64 s[30:31] 2267; 2268; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2269; GFX8: ; %bb.0: 2270; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2271; GFX8-NEXT: v_mov_b32_e32 v2, s20 2272; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 2273; GFX8-NEXT: s_add_i32 s6, s20, 0x800 2274; GFX8-NEXT: s_mov_b64 s[4:5], 0 2275; GFX8-NEXT: v_mov_b32_e32 v6, s6 2276; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start 2277; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2278; GFX8-NEXT: s_waitcnt vmcnt(0) 2279; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] 2280; GFX8-NEXT: 
v_mov_b32_e32 v10, v5 2281; GFX8-NEXT: v_mov_b32_e32 v9, v4 2282; GFX8-NEXT: v_mov_b32_e32 v8, v3 2283; GFX8-NEXT: v_mov_b32_e32 v7, v2 2284; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2285; GFX8-NEXT: s_waitcnt vmcnt(0) 2286; GFX8-NEXT: buffer_wbinvl1 2287; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] 2288; GFX8-NEXT: v_mov_b32_e32 v4, v7 2289; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2290; GFX8-NEXT: v_mov_b32_e32 v5, v8 2291; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2292; GFX8-NEXT: s_cbranch_execnz .LBB9_1 2293; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2294; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2295; GFX8-NEXT: s_setpc_b64 s[30:31] 2296; 2297; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2298; GFX7: ; %bb.0: 2299; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2300; GFX7-NEXT: v_mov_b32_e32 v2, s20 2301; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 2302; GFX7-NEXT: s_add_i32 s6, s20, 0x800 2303; GFX7-NEXT: s_mov_b64 s[4:5], 0 2304; GFX7-NEXT: v_mov_b32_e32 v6, s6 2305; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start 2306; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2307; GFX7-NEXT: s_waitcnt vmcnt(0) 2308; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] 2309; GFX7-NEXT: v_mov_b32_e32 v10, v5 2310; GFX7-NEXT: v_mov_b32_e32 v9, v4 2311; GFX7-NEXT: v_mov_b32_e32 v8, v3 2312; GFX7-NEXT: v_mov_b32_e32 v7, v2 2313; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2314; GFX7-NEXT: s_waitcnt vmcnt(0) 2315; GFX7-NEXT: buffer_wbinvl1 2316; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] 2317; GFX7-NEXT: v_mov_b32_e32 v4, v7 2318; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2319; GFX7-NEXT: v_mov_b32_e32 v5, v8 2320; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2321; GFX7-NEXT: s_cbranch_execnz .LBB9_1 2322; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2323; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2324; GFX7-NEXT: s_setpc_b64 s[30:31] 2325; 2326; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: 2327; GFX6: ; %bb.0: 2328; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2329; GFX6-NEXT: v_mov_b32_e32 v2, s20 2330; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 2331; GFX6-NEXT: s_add_i32 s6, s20, 0x800 2332; GFX6-NEXT: s_mov_b64 s[4:5], 0 2333; GFX6-NEXT: v_mov_b32_e32 v6, s6 2334; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start 2335; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 2336; GFX6-NEXT: s_waitcnt vmcnt(0) 2337; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] 2338; GFX6-NEXT: s_waitcnt expcnt(0) 2339; GFX6-NEXT: v_mov_b32_e32 v10, v5 2340; GFX6-NEXT: v_mov_b32_e32 v9, v4 2341; GFX6-NEXT: v_mov_b32_e32 v8, v3 2342; GFX6-NEXT: v_mov_b32_e32 v7, v2 2343; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2344; GFX6-NEXT: s_waitcnt vmcnt(0) 2345; GFX6-NEXT: buffer_wbinvl1 2346; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] 2347; GFX6-NEXT: v_mov_b32_e32 v4, v7 2348; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2349; GFX6-NEXT: v_mov_b32_e32 v5, v8 2350; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 2351; GFX6-NEXT: s_cbranch_execnz .LBB9_1 2352; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 2353; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 2354; GFX6-NEXT: s_waitcnt expcnt(0) 2355; GFX6-NEXT: s_setpc_b64 s[30:31] 2356 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 2357 %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2358 ret void 2359} 2360 2361define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { 2362; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2363; GFX12: ; %bb.0: 2364; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2365; GFX12-NEXT: s_wait_expcnt 0x0 2366; GFX12-NEXT: s_wait_samplecnt 0x0 2367; GFX12-NEXT: s_wait_bvhcnt 0x0 
2368; GFX12-NEXT: s_wait_kmcnt 0x0 2369; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 2370; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2371; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 2372; GFX12-NEXT: s_mov_b32 s1, exec_lo 2373; GFX12-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2374; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 2375; GFX12-NEXT: v_readfirstlane_b32 s4, v9 2376; GFX12-NEXT: v_readfirstlane_b32 s5, v10 2377; GFX12-NEXT: v_readfirstlane_b32 s6, v7 2378; GFX12-NEXT: v_readfirstlane_b32 s7, v8 2379; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2380; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 2381; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 2382; GFX12-NEXT: s_wait_alu 0xfffe 2383; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2384; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 2385; GFX12-NEXT: s_wait_alu 0xfffe 2386; GFX12-NEXT: s_and_saveexec_b32 s0, s0 2387; GFX12-NEXT: s_wait_loadcnt 0x0 2388; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 2389; GFX12-NEXT: ; implicit-def: $vgpr4 2390; GFX12-NEXT: s_wait_alu 0xfffe 2391; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 2392; GFX12-NEXT: s_cbranch_execnz .LBB10_1 2393; GFX12-NEXT: ; %bb.2: 2394; GFX12-NEXT: s_mov_b32 exec_lo, s1 2395; GFX12-NEXT: s_mov_b32 s1, 0 2396; GFX12-NEXT: .LBB10_3: ; %atomicrmw.start 2397; GFX12-NEXT: ; =>This Loop Header: Depth=1 2398; GFX12-NEXT: ; Child Loop BB10_4 Depth 2 2399; GFX12-NEXT: s_wait_loadcnt 0x0 2400; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] 2401; GFX12-NEXT: s_mov_b32 s2, exec_lo 2402; GFX12-NEXT: s_wait_storecnt 0x0 2403; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2404; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 2405; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 2406; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2407; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 
2408; GFX12-NEXT: v_readfirstlane_b32 s4, v9 2409; GFX12-NEXT: v_readfirstlane_b32 s5, v10 2410; GFX12-NEXT: v_readfirstlane_b32 s6, v7 2411; GFX12-NEXT: v_readfirstlane_b32 s7, v8 2412; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2413; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 2414; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 2415; GFX12-NEXT: s_wait_alu 0xfffe 2416; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2417; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 2418; GFX12-NEXT: s_wait_alu 0xfffe 2419; GFX12-NEXT: s_and_saveexec_b32 s0, s0 2420; GFX12-NEXT: s_wait_loadcnt 0x0 2421; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN 2422; GFX12-NEXT: s_wait_alu 0xfffe 2423; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 2424; GFX12-NEXT: s_cbranch_execnz .LBB10_4 2425; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2426; GFX12-NEXT: s_mov_b32 exec_lo, s2 2427; GFX12-NEXT: s_wait_loadcnt 0x0 2428; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] 2429; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 2430; GFX12-NEXT: global_inv scope:SCOPE_DEV 2431; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 2432; GFX12-NEXT: s_wait_alu 0xfffe 2433; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 2434; GFX12-NEXT: s_cbranch_execnz .LBB10_3 2435; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 2436; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 2437; GFX12-NEXT: s_wait_alu 0xfffe 2438; GFX12-NEXT: s_setpc_b64 s[30:31] 2439; 2440; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2441; GFX940: ; %bb.0: 2442; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2443; GFX940-NEXT: v_mov_b32_e32 v7, v6 2444; GFX940-NEXT: v_mov_b32_e32 v6, v5 2445; GFX940-NEXT: s_mov_b64 s[2:3], exec 2446; GFX940-NEXT: buffer_wbl2 sc1 2447; GFX940-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2448; GFX940-NEXT: v_readfirstlane_b32 s4, v0 2449; GFX940-NEXT: 
v_readfirstlane_b32 s5, v1 2450; GFX940-NEXT: v_readfirstlane_b32 s6, v2 2451; GFX940-NEXT: v_readfirstlane_b32 s7, v3 2452; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 2453; GFX940-NEXT: s_nop 0 2454; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 2455; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 2456; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 2457; GFX940-NEXT: s_waitcnt vmcnt(0) 2458; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 2459; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2460; GFX940-NEXT: ; implicit-def: $vgpr4 2461; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 2462; GFX940-NEXT: s_cbranch_execnz .LBB10_1 2463; GFX940-NEXT: ; %bb.2: 2464; GFX940-NEXT: s_mov_b64 exec, s[2:3] 2465; GFX940-NEXT: s_waitcnt vmcnt(0) 2466; GFX940-NEXT: v_mov_b32_e32 v0, v6 2467; GFX940-NEXT: v_mov_b32_e32 v1, v7 2468; GFX940-NEXT: buffer_inv sc1 2469; GFX940-NEXT: s_setpc_b64 s[30:31] 2470; 2471; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2472; GFX11: ; %bb.0: 2473; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2474; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 2475; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2476; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 2477; GFX11-NEXT: s_mov_b32 s1, 0 2478; GFX11-NEXT: s_mov_b32 s2, exec_lo 2479; GFX11-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2480; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) 2481; GFX11-NEXT: v_readfirstlane_b32 s4, v9 2482; GFX11-NEXT: v_readfirstlane_b32 s5, v10 2483; GFX11-NEXT: v_readfirstlane_b32 s6, v7 2484; GFX11-NEXT: v_readfirstlane_b32 s7, v8 2485; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 2486; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2487; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 2488; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 2489; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) 2490; GFX11-NEXT: s_and_saveexec_b32 s0, s0 2491; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 2492; GFX11-NEXT: ; implicit-def: $vgpr4 2493; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 2494; GFX11-NEXT: s_cbranch_execnz .LBB10_1 2495; GFX11-NEXT: ; %bb.2: 2496; GFX11-NEXT: s_mov_b32 exec_lo, s2 2497; GFX11-NEXT: .p2align 6 2498; GFX11-NEXT: .LBB10_3: ; %atomicrmw.start 2499; GFX11-NEXT: ; =>This Loop Header: Depth=1 2500; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 2501; GFX11-NEXT: s_waitcnt vmcnt(0) 2502; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] 2503; GFX11-NEXT: s_mov_b32 s2, exec_lo 2504; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2505; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2506; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 2507; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 2508; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2509; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 2510; GFX11-NEXT: v_readfirstlane_b32 s4, v9 2511; GFX11-NEXT: v_readfirstlane_b32 s5, v10 2512; GFX11-NEXT: v_readfirstlane_b32 s6, v7 2513; GFX11-NEXT: v_readfirstlane_b32 s7, v8 2514; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2515; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 2516; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 2517; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 2518; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 2519; GFX11-NEXT: s_and_saveexec_b32 s0, s0 2520; GFX11-NEXT: s_waitcnt vmcnt(0) 2521; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc 2522; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 2523; GFX11-NEXT: s_cbranch_execnz .LBB10_4 2524; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2525; GFX11-NEXT: s_mov_b32 exec_lo, s2 2526; GFX11-NEXT: s_waitcnt vmcnt(0) 2527; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] 2528; GFX11-NEXT: v_dual_mov_b32 v14, 
v1 :: v_dual_mov_b32 v13, v0 2529; GFX11-NEXT: buffer_gl1_inv 2530; GFX11-NEXT: buffer_gl0_inv 2531; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 2532; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2533; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 2534; GFX11-NEXT: s_cbranch_execnz .LBB10_3 2535; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 2536; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 2537; GFX11-NEXT: s_setpc_b64 s[30:31] 2538; 2539; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2540; GFX10: ; %bb.0: 2541; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2542; GFX10-NEXT: v_mov_b32_e32 v8, v3 2543; GFX10-NEXT: v_mov_b32_e32 v7, v2 2544; GFX10-NEXT: v_mov_b32_e32 v10, v1 2545; GFX10-NEXT: v_mov_b32_e32 v9, v0 2546; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 2547; GFX10-NEXT: s_mov_b32 s5, 0 2548; GFX10-NEXT: s_mov_b32 s6, exec_lo 2549; GFX10-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2550; GFX10-NEXT: v_readfirstlane_b32 s8, v9 2551; GFX10-NEXT: v_readfirstlane_b32 s9, v10 2552; GFX10-NEXT: v_readfirstlane_b32 s10, v7 2553; GFX10-NEXT: v_readfirstlane_b32 s11, v8 2554; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] 2555; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] 2556; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 2557; GFX10-NEXT: s_and_saveexec_b32 s4, s4 2558; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 2559; GFX10-NEXT: ; implicit-def: $vgpr4 2560; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2561; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 2562; GFX10-NEXT: s_cbranch_execnz .LBB10_1 2563; GFX10-NEXT: ; %bb.2: 2564; GFX10-NEXT: s_mov_b32 exec_lo, s6 2565; GFX10-NEXT: .LBB10_3: ; %atomicrmw.start 2566; GFX10-NEXT: ; =>This Loop Header: Depth=1 2567; GFX10-NEXT: ; Child Loop BB10_4 Depth 2 2568; GFX10-NEXT: s_waitcnt vmcnt(0) 2569; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] 2570; GFX10-NEXT: s_mov_b32 s6, exec_lo 2571; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2572; 
GFX10-NEXT: v_mov_b32_e32 v0, v11 2573; GFX10-NEXT: v_mov_b32_e32 v1, v12 2574; GFX10-NEXT: v_mov_b32_e32 v2, v13 2575; GFX10-NEXT: v_mov_b32_e32 v3, v14 2576; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2577; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 2578; GFX10-NEXT: v_readfirstlane_b32 s8, v9 2579; GFX10-NEXT: v_readfirstlane_b32 s9, v10 2580; GFX10-NEXT: v_readfirstlane_b32 s10, v7 2581; GFX10-NEXT: v_readfirstlane_b32 s11, v8 2582; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] 2583; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] 2584; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 2585; GFX10-NEXT: s_and_saveexec_b32 s4, s4 2586; GFX10-NEXT: s_waitcnt vmcnt(0) 2587; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 2588; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2589; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 2590; GFX10-NEXT: s_cbranch_execnz .LBB10_4 2591; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2592; GFX10-NEXT: s_mov_b32 exec_lo, s6 2593; GFX10-NEXT: s_waitcnt vmcnt(0) 2594; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] 2595; GFX10-NEXT: v_mov_b32_e32 v14, v1 2596; GFX10-NEXT: v_mov_b32_e32 v13, v0 2597; GFX10-NEXT: buffer_gl1_inv 2598; GFX10-NEXT: buffer_gl0_inv 2599; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 2600; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2601; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 2602; GFX10-NEXT: s_cbranch_execnz .LBB10_3 2603; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 2604; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 2605; GFX10-NEXT: s_setpc_b64 s[30:31] 2606; 2607; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2608; GFX90A: ; %bb.0: 2609; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2610; GFX90A-NEXT: v_mov_b32_e32 v7, v6 2611; GFX90A-NEXT: v_mov_b32_e32 v6, v5 2612; GFX90A-NEXT: s_mov_b64 s[6:7], exec 2613; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2614; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 2615; 
GFX90A-NEXT: v_readfirstlane_b32 s9, v1 2616; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 2617; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 2618; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 2619; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 2620; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2621; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2622; GFX90A-NEXT: s_waitcnt vmcnt(0) 2623; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc 2624; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2625; GFX90A-NEXT: ; implicit-def: $vgpr4 2626; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 2627; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 2628; GFX90A-NEXT: ; %bb.2: 2629; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 2630; GFX90A-NEXT: s_waitcnt vmcnt(0) 2631; GFX90A-NEXT: v_mov_b32_e32 v0, v6 2632; GFX90A-NEXT: v_mov_b32_e32 v1, v7 2633; GFX90A-NEXT: buffer_wbinvl1 2634; GFX90A-NEXT: s_setpc_b64 s[30:31] 2635; 2636; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2637; GFX908: ; %bb.0: 2638; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2639; GFX908-NEXT: v_mov_b32_e32 v8, v3 2640; GFX908-NEXT: v_mov_b32_e32 v7, v2 2641; GFX908-NEXT: v_mov_b32_e32 v10, v1 2642; GFX908-NEXT: v_mov_b32_e32 v9, v0 2643; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 2644; GFX908-NEXT: s_mov_b64 s[6:7], exec 2645; GFX908-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2646; GFX908-NEXT: v_readfirstlane_b32 s8, v9 2647; GFX908-NEXT: v_readfirstlane_b32 s9, v10 2648; GFX908-NEXT: v_readfirstlane_b32 s10, v7 2649; GFX908-NEXT: v_readfirstlane_b32 s11, v8 2650; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2651; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2652; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2653; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2654; GFX908-NEXT: s_nop 0 2655; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 2656; GFX908-NEXT: ; 
implicit-def: $vgpr4 2657; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 2658; GFX908-NEXT: s_cbranch_execnz .LBB10_1 2659; GFX908-NEXT: ; %bb.2: 2660; GFX908-NEXT: s_mov_b64 exec, s[6:7] 2661; GFX908-NEXT: s_mov_b64 s[6:7], 0 2662; GFX908-NEXT: .LBB10_3: ; %atomicrmw.start 2663; GFX908-NEXT: ; =>This Loop Header: Depth=1 2664; GFX908-NEXT: ; Child Loop BB10_4 Depth 2 2665; GFX908-NEXT: s_waitcnt vmcnt(0) 2666; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] 2667; GFX908-NEXT: s_mov_b64 s[12:13], exec 2668; GFX908-NEXT: v_mov_b32_e32 v0, v11 2669; GFX908-NEXT: v_mov_b32_e32 v1, v12 2670; GFX908-NEXT: v_mov_b32_e32 v2, v13 2671; GFX908-NEXT: v_mov_b32_e32 v3, v14 2672; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2673; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 2674; GFX908-NEXT: v_readfirstlane_b32 s8, v9 2675; GFX908-NEXT: v_readfirstlane_b32 s9, v10 2676; GFX908-NEXT: v_readfirstlane_b32 s10, v7 2677; GFX908-NEXT: v_readfirstlane_b32 s11, v8 2678; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2679; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2680; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2681; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2682; GFX908-NEXT: s_waitcnt vmcnt(0) 2683; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 2684; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 2685; GFX908-NEXT: s_cbranch_execnz .LBB10_4 2686; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2687; GFX908-NEXT: s_mov_b64 exec, s[12:13] 2688; GFX908-NEXT: s_waitcnt vmcnt(0) 2689; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 2690; GFX908-NEXT: v_mov_b32_e32 v14, v1 2691; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 2692; GFX908-NEXT: v_mov_b32_e32 v13, v0 2693; GFX908-NEXT: buffer_wbinvl1 2694; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 2695; GFX908-NEXT: s_cbranch_execnz .LBB10_3 2696; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 2697; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 2698; GFX908-NEXT: s_setpc_b64 s[30:31] 2699; 
2700; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2701; GFX8: ; %bb.0: 2702; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2703; GFX8-NEXT: v_mov_b32_e32 v8, v3 2704; GFX8-NEXT: v_mov_b32_e32 v7, v2 2705; GFX8-NEXT: v_mov_b32_e32 v10, v1 2706; GFX8-NEXT: v_mov_b32_e32 v9, v0 2707; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 2708; GFX8-NEXT: s_mov_b64 s[6:7], exec 2709; GFX8-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2710; GFX8-NEXT: v_readfirstlane_b32 s8, v9 2711; GFX8-NEXT: v_readfirstlane_b32 s9, v10 2712; GFX8-NEXT: v_readfirstlane_b32 s10, v7 2713; GFX8-NEXT: v_readfirstlane_b32 s11, v8 2714; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2715; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2716; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2717; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2718; GFX8-NEXT: s_nop 0 2719; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 2720; GFX8-NEXT: ; implicit-def: $vgpr4 2721; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 2722; GFX8-NEXT: s_cbranch_execnz .LBB10_1 2723; GFX8-NEXT: ; %bb.2: 2724; GFX8-NEXT: s_mov_b64 exec, s[6:7] 2725; GFX8-NEXT: s_mov_b64 s[6:7], 0 2726; GFX8-NEXT: .LBB10_3: ; %atomicrmw.start 2727; GFX8-NEXT: ; =>This Loop Header: Depth=1 2728; GFX8-NEXT: ; Child Loop BB10_4 Depth 2 2729; GFX8-NEXT: s_waitcnt vmcnt(0) 2730; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] 2731; GFX8-NEXT: s_mov_b64 s[12:13], exec 2732; GFX8-NEXT: v_mov_b32_e32 v0, v11 2733; GFX8-NEXT: v_mov_b32_e32 v1, v12 2734; GFX8-NEXT: v_mov_b32_e32 v2, v13 2735; GFX8-NEXT: v_mov_b32_e32 v3, v14 2736; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2737; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 2738; GFX8-NEXT: v_readfirstlane_b32 s8, v9 2739; GFX8-NEXT: v_readfirstlane_b32 s9, v10 2740; GFX8-NEXT: v_readfirstlane_b32 s10, v7 2741; GFX8-NEXT: v_readfirstlane_b32 s11, v8 2742; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2743; 
GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2744; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2745; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2746; GFX8-NEXT: s_waitcnt vmcnt(0) 2747; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 2748; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 2749; GFX8-NEXT: s_cbranch_execnz .LBB10_4 2750; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2751; GFX8-NEXT: s_mov_b64 exec, s[12:13] 2752; GFX8-NEXT: s_waitcnt vmcnt(0) 2753; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 2754; GFX8-NEXT: v_mov_b32_e32 v14, v1 2755; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 2756; GFX8-NEXT: v_mov_b32_e32 v13, v0 2757; GFX8-NEXT: buffer_wbinvl1 2758; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 2759; GFX8-NEXT: s_cbranch_execnz .LBB10_3 2760; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 2761; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 2762; GFX8-NEXT: s_setpc_b64 s[30:31] 2763; 2764; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2765; GFX7: ; %bb.0: 2766; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2767; GFX7-NEXT: v_mov_b32_e32 v8, v3 2768; GFX7-NEXT: v_mov_b32_e32 v7, v2 2769; GFX7-NEXT: v_mov_b32_e32 v10, v1 2770; GFX7-NEXT: v_mov_b32_e32 v9, v0 2771; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 2772; GFX7-NEXT: s_mov_b64 s[6:7], exec 2773; GFX7-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2774; GFX7-NEXT: v_readfirstlane_b32 s8, v9 2775; GFX7-NEXT: v_readfirstlane_b32 s9, v10 2776; GFX7-NEXT: v_readfirstlane_b32 s10, v7 2777; GFX7-NEXT: v_readfirstlane_b32 s11, v8 2778; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2779; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2780; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2781; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2782; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 2783; GFX7-NEXT: ; implicit-def: $vgpr4 2784; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 2785; GFX7-NEXT: 
s_cbranch_execnz .LBB10_1 2786; GFX7-NEXT: ; %bb.2: 2787; GFX7-NEXT: s_mov_b64 exec, s[6:7] 2788; GFX7-NEXT: s_mov_b64 s[6:7], 0 2789; GFX7-NEXT: .LBB10_3: ; %atomicrmw.start 2790; GFX7-NEXT: ; =>This Loop Header: Depth=1 2791; GFX7-NEXT: ; Child Loop BB10_4 Depth 2 2792; GFX7-NEXT: s_waitcnt vmcnt(0) 2793; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] 2794; GFX7-NEXT: s_mov_b64 s[12:13], exec 2795; GFX7-NEXT: v_mov_b32_e32 v0, v11 2796; GFX7-NEXT: v_mov_b32_e32 v1, v12 2797; GFX7-NEXT: v_mov_b32_e32 v2, v13 2798; GFX7-NEXT: v_mov_b32_e32 v3, v14 2799; GFX7-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2800; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 2801; GFX7-NEXT: v_readfirstlane_b32 s8, v9 2802; GFX7-NEXT: v_readfirstlane_b32 s9, v10 2803; GFX7-NEXT: v_readfirstlane_b32 s10, v7 2804; GFX7-NEXT: v_readfirstlane_b32 s11, v8 2805; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2806; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2807; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2808; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2809; GFX7-NEXT: s_waitcnt vmcnt(0) 2810; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 2811; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 2812; GFX7-NEXT: s_cbranch_execnz .LBB10_4 2813; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2814; GFX7-NEXT: s_mov_b64 exec, s[12:13] 2815; GFX7-NEXT: s_waitcnt vmcnt(0) 2816; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 2817; GFX7-NEXT: v_mov_b32_e32 v14, v1 2818; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 2819; GFX7-NEXT: v_mov_b32_e32 v13, v0 2820; GFX7-NEXT: buffer_wbinvl1 2821; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 2822; GFX7-NEXT: s_cbranch_execnz .LBB10_3 2823; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 2824; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 2825; GFX7-NEXT: s_setpc_b64 s[30:31] 2826; 2827; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 2828; GFX6: ; %bb.0: 2829; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 2830; GFX6-NEXT: v_mov_b32_e32 v8, v3 2831; GFX6-NEXT: v_mov_b32_e32 v7, v2 2832; GFX6-NEXT: v_mov_b32_e32 v10, v1 2833; GFX6-NEXT: v_mov_b32_e32 v9, v0 2834; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 2835; GFX6-NEXT: s_mov_b64 s[6:7], exec 2836; GFX6-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 2837; GFX6-NEXT: v_readfirstlane_b32 s8, v9 2838; GFX6-NEXT: v_readfirstlane_b32 s9, v10 2839; GFX6-NEXT: v_readfirstlane_b32 s10, v7 2840; GFX6-NEXT: v_readfirstlane_b32 s11, v8 2841; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2842; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2843; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2844; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 2845; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 2846; GFX6-NEXT: ; implicit-def: $vgpr4 2847; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 2848; GFX6-NEXT: s_cbranch_execnz .LBB10_1 2849; GFX6-NEXT: ; %bb.2: 2850; GFX6-NEXT: s_mov_b64 exec, s[6:7] 2851; GFX6-NEXT: s_mov_b64 s[6:7], 0 2852; GFX6-NEXT: .LBB10_3: ; %atomicrmw.start 2853; GFX6-NEXT: ; =>This Loop Header: Depth=1 2854; GFX6-NEXT: ; Child Loop BB10_4 Depth 2 2855; GFX6-NEXT: s_waitcnt vmcnt(0) 2856; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] 2857; GFX6-NEXT: s_mov_b64 s[12:13], exec 2858; GFX6-NEXT: s_waitcnt expcnt(0) 2859; GFX6-NEXT: v_mov_b32_e32 v0, v11 2860; GFX6-NEXT: v_mov_b32_e32 v1, v12 2861; GFX6-NEXT: v_mov_b32_e32 v2, v13 2862; GFX6-NEXT: v_mov_b32_e32 v3, v14 2863; GFX6-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 2864; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 2865; GFX6-NEXT: v_readfirstlane_b32 s8, v9 2866; GFX6-NEXT: v_readfirstlane_b32 s9, v10 2867; GFX6-NEXT: v_readfirstlane_b32 s10, v7 2868; GFX6-NEXT: v_readfirstlane_b32 s11, v8 2869; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 2870; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 2871; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 2872; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 
2873; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2874; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 2875; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 2876; GFX6-NEXT: s_cbranch_execnz .LBB10_4 2877; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 2878; GFX6-NEXT: s_mov_b64 exec, s[12:13] 2879; GFX6-NEXT: s_waitcnt vmcnt(0) 2880; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 2881; GFX6-NEXT: v_mov_b32_e32 v14, v1 2882; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 2883; GFX6-NEXT: v_mov_b32_e32 v13, v0 2884; GFX6-NEXT: buffer_wbinvl1 2885; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 2886; GFX6-NEXT: s_cbranch_execnz .LBB10_3 2887; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 2888; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 2889; GFX6-NEXT: s_waitcnt expcnt(0) 2890; GFX6-NEXT: s_setpc_b64 s[30:31] 2891 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 2892 %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2893 ret double %result 2894} 2895 2896define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 2897; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 2898; GFX12: ; %bb.0: 2899; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2900; GFX12-NEXT: s_wait_expcnt 0x0 2901; GFX12-NEXT: s_wait_samplecnt 0x0 2902; GFX12-NEXT: s_wait_bvhcnt 0x0 2903; GFX12-NEXT: s_wait_kmcnt 0x0 2904; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 2905; GFX12-NEXT: v_mov_b32_e32 v0, s16 2906; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 2907; GFX12-NEXT: s_wait_alu 0xfffe 2908; GFX12-NEXT: v_mov_b32_e32 v6, s4 2909; GFX12-NEXT: s_mov_b32 s4, 0 2910; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 2911; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start 2912; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2913; GFX12-NEXT: s_wait_loadcnt 0x0 2914; GFX12-NEXT: v_dual_mov_b32 v10, 
v1 :: v_dual_mov_b32 v9, v0 2915; GFX12-NEXT: s_wait_storecnt 0x0 2916; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2917; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] 2918; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 2919; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 2920; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 2921; GFX12-NEXT: s_wait_loadcnt 0x0 2922; GFX12-NEXT: global_inv scope:SCOPE_DEV 2923; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2924; GFX12-NEXT: s_wait_alu 0xfffe 2925; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 2926; GFX12-NEXT: s_wait_alu 0xfffe 2927; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2928; GFX12-NEXT: s_cbranch_execnz .LBB11_1 2929; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2930; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 2931; GFX12-NEXT: s_wait_alu 0xfffe 2932; GFX12-NEXT: s_setpc_b64 s[30:31] 2933; 2934; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 2935; GFX940: ; %bb.0: 2936; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2937; GFX940-NEXT: v_mov_b32_e32 v2, s16 2938; GFX940-NEXT: buffer_wbl2 sc1 2939; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 2940; GFX940-NEXT: s_waitcnt vmcnt(0) 2941; GFX940-NEXT: buffer_inv sc1 2942; GFX940-NEXT: s_setpc_b64 s[30:31] 2943; 2944; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 2945; GFX11: ; %bb.0: 2946; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2947; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 2948; GFX11-NEXT: v_mov_b32_e32 v0, s16 2949; GFX11-NEXT: s_add_i32 s4, s16, 0x800 2950; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2951; GFX11-NEXT: v_mov_b32_e32 v6, s4 2952; GFX11-NEXT: s_mov_b32 s4, 0 2953; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 2954; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start 2955; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2956; GFX11-NEXT: s_waitcnt vmcnt(0) 2957; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2958; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2959; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2960; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 2961; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 2962; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 2963; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc 2964; GFX11-NEXT: s_waitcnt vmcnt(0) 2965; GFX11-NEXT: buffer_gl1_inv 2966; GFX11-NEXT: buffer_gl0_inv 2967; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2968; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 2969; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2970; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2971; GFX11-NEXT: s_cbranch_execnz .LBB11_1 2972; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2973; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 2974; GFX11-NEXT: s_setpc_b64 s[30:31] 2975; 2976; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 2977; GFX10: ; %bb.0: 2978; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2979; GFX10-NEXT: v_mov_b32_e32 v4, v0 2980; GFX10-NEXT: v_mov_b32_e32 v0, s20 2981; GFX10-NEXT: v_mov_b32_e32 v5, v1 2982; GFX10-NEXT: s_add_i32 s4, s20, 0x800 2983; GFX10-NEXT: v_mov_b32_e32 v6, s4 2984; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2985; GFX10-NEXT: s_mov_b32 s4, 0 2986; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start 2987; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2988; GFX10-NEXT: s_waitcnt vmcnt(0) 2989; GFX10-NEXT: v_mov_b32_e32 v10, v1 2990; GFX10-NEXT: v_mov_b32_e32 v9, v0 2991; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2992; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 2993; GFX10-NEXT: v_mov_b32_e32 v0, v7 2994; GFX10-NEXT: v_mov_b32_e32 v1, v8 2995; GFX10-NEXT: v_mov_b32_e32 v2, v9 2996; GFX10-NEXT: v_mov_b32_e32 v3, v10 2997; 
GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2998; GFX10-NEXT: s_waitcnt vmcnt(0) 2999; GFX10-NEXT: buffer_gl1_inv 3000; GFX10-NEXT: buffer_gl0_inv 3001; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 3002; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 3003; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 3004; GFX10-NEXT: s_cbranch_execnz .LBB11_1 3005; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3006; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3007; GFX10-NEXT: s_setpc_b64 s[30:31] 3008; 3009; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 3010; GFX90A: ; %bb.0: 3011; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3012; GFX90A-NEXT: v_mov_b32_e32 v4, v0 3013; GFX90A-NEXT: v_mov_b32_e32 v0, s20 3014; GFX90A-NEXT: v_mov_b32_e32 v5, v1 3015; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3016; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 3017; GFX90A-NEXT: s_mov_b64 s[4:5], 0 3018; GFX90A-NEXT: v_mov_b32_e32 v6, s6 3019; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start 3020; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3021; GFX90A-NEXT: s_waitcnt vmcnt(0) 3022; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] 3023; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] 3024; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] 3025; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] 3026; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3027; GFX90A-NEXT: s_waitcnt vmcnt(0) 3028; GFX90A-NEXT: buffer_wbinvl1 3029; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 3030; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3031; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 3032; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 3033; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 3034; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3035; GFX90A-NEXT: s_setpc_b64 s[30:31] 3036; 3037; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 
3038; GFX908: ; %bb.0: 3039; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3040; GFX908-NEXT: v_mov_b32_e32 v4, v0 3041; GFX908-NEXT: v_mov_b32_e32 v0, s20 3042; GFX908-NEXT: v_mov_b32_e32 v5, v1 3043; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3044; GFX908-NEXT: s_add_i32 s6, s20, 0x800 3045; GFX908-NEXT: s_mov_b64 s[4:5], 0 3046; GFX908-NEXT: v_mov_b32_e32 v6, s6 3047; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start 3048; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3049; GFX908-NEXT: s_waitcnt vmcnt(0) 3050; GFX908-NEXT: v_mov_b32_e32 v10, v1 3051; GFX908-NEXT: v_mov_b32_e32 v9, v0 3052; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3053; GFX908-NEXT: v_mov_b32_e32 v0, v7 3054; GFX908-NEXT: v_mov_b32_e32 v1, v8 3055; GFX908-NEXT: v_mov_b32_e32 v2, v9 3056; GFX908-NEXT: v_mov_b32_e32 v3, v10 3057; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3058; GFX908-NEXT: s_waitcnt vmcnt(0) 3059; GFX908-NEXT: buffer_wbinvl1 3060; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3061; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3062; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3063; GFX908-NEXT: s_cbranch_execnz .LBB11_1 3064; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3065; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3066; GFX908-NEXT: s_setpc_b64 s[30:31] 3067; 3068; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 3069; GFX8: ; %bb.0: 3070; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3071; GFX8-NEXT: v_mov_b32_e32 v4, v0 3072; GFX8-NEXT: v_mov_b32_e32 v0, s20 3073; GFX8-NEXT: v_mov_b32_e32 v5, v1 3074; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3075; GFX8-NEXT: s_add_i32 s6, s20, 0x800 3076; GFX8-NEXT: s_mov_b64 s[4:5], 0 3077; GFX8-NEXT: v_mov_b32_e32 v6, s6 3078; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start 3079; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3080; GFX8-NEXT: s_waitcnt vmcnt(0) 3081; GFX8-NEXT: v_mov_b32_e32 v10, v1 3082; 
GFX8-NEXT: v_mov_b32_e32 v9, v0 3083; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3084; GFX8-NEXT: v_mov_b32_e32 v0, v7 3085; GFX8-NEXT: v_mov_b32_e32 v1, v8 3086; GFX8-NEXT: v_mov_b32_e32 v2, v9 3087; GFX8-NEXT: v_mov_b32_e32 v3, v10 3088; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3089; GFX8-NEXT: s_waitcnt vmcnt(0) 3090; GFX8-NEXT: buffer_wbinvl1 3091; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3092; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3093; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3094; GFX8-NEXT: s_cbranch_execnz .LBB11_1 3095; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3096; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3097; GFX8-NEXT: s_setpc_b64 s[30:31] 3098; 3099; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 3100; GFX7: ; %bb.0: 3101; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3102; GFX7-NEXT: v_mov_b32_e32 v4, v0 3103; GFX7-NEXT: v_mov_b32_e32 v0, s20 3104; GFX7-NEXT: v_mov_b32_e32 v5, v1 3105; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3106; GFX7-NEXT: s_add_i32 s6, s20, 0x800 3107; GFX7-NEXT: s_mov_b64 s[4:5], 0 3108; GFX7-NEXT: v_mov_b32_e32 v6, s6 3109; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start 3110; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3111; GFX7-NEXT: s_waitcnt vmcnt(0) 3112; GFX7-NEXT: v_mov_b32_e32 v10, v1 3113; GFX7-NEXT: v_mov_b32_e32 v9, v0 3114; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3115; GFX7-NEXT: v_mov_b32_e32 v0, v7 3116; GFX7-NEXT: v_mov_b32_e32 v1, v8 3117; GFX7-NEXT: v_mov_b32_e32 v2, v9 3118; GFX7-NEXT: v_mov_b32_e32 v3, v10 3119; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3120; GFX7-NEXT: s_waitcnt vmcnt(0) 3121; GFX7-NEXT: buffer_wbinvl1 3122; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3123; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3124; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3125; GFX7-NEXT: s_cbranch_execnz .LBB11_1 3126; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3127; GFX7-NEXT: 
s_or_b64 exec, exec, s[4:5] 3128; GFX7-NEXT: s_setpc_b64 s[30:31] 3129; 3130; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: 3131; GFX6: ; %bb.0: 3132; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3133; GFX6-NEXT: v_mov_b32_e32 v4, v0 3134; GFX6-NEXT: v_mov_b32_e32 v0, s20 3135; GFX6-NEXT: v_mov_b32_e32 v5, v1 3136; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3137; GFX6-NEXT: s_add_i32 s6, s20, 0x800 3138; GFX6-NEXT: s_mov_b64 s[4:5], 0 3139; GFX6-NEXT: v_mov_b32_e32 v6, s6 3140; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start 3141; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 3142; GFX6-NEXT: s_waitcnt vmcnt(0) 3143; GFX6-NEXT: v_mov_b32_e32 v10, v1 3144; GFX6-NEXT: v_mov_b32_e32 v9, v0 3145; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3146; GFX6-NEXT: s_waitcnt expcnt(0) 3147; GFX6-NEXT: v_mov_b32_e32 v0, v7 3148; GFX6-NEXT: v_mov_b32_e32 v1, v8 3149; GFX6-NEXT: v_mov_b32_e32 v2, v9 3150; GFX6-NEXT: v_mov_b32_e32 v3, v10 3151; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3152; GFX6-NEXT: s_waitcnt vmcnt(0) 3153; GFX6-NEXT: buffer_wbinvl1 3154; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3155; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3156; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 3157; GFX6-NEXT: s_cbranch_execnz .LBB11_1 3158; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 3159; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 3160; GFX6-NEXT: s_waitcnt expcnt(0) 3161; GFX6-NEXT: s_setpc_b64 s[30:31] 3162 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 3163 %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 3164 ret double %result 3165} 3166 3167define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 3168; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3169; GFX12: ; %bb.0: 3170; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3171; GFX12-NEXT: s_wait_expcnt 0x0 3172; GFX12-NEXT: s_wait_samplecnt 0x0 3173; GFX12-NEXT: s_wait_bvhcnt 0x0 3174; GFX12-NEXT: s_wait_kmcnt 0x0 3175; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 3176; GFX12-NEXT: v_mov_b32_e32 v0, s16 3177; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 3178; GFX12-NEXT: s_wait_alu 0xfffe 3179; GFX12-NEXT: v_mov_b32_e32 v6, s4 3180; GFX12-NEXT: s_mov_b32 s4, 0 3181; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 3182; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start 3183; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3184; GFX12-NEXT: s_wait_loadcnt 0x0 3185; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 3186; GFX12-NEXT: s_wait_storecnt 0x0 3187; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3188; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] 3189; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 3190; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 3191; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 3192; GFX12-NEXT: s_wait_loadcnt 0x0 3193; GFX12-NEXT: global_inv scope:SCOPE_DEV 3194; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 3195; GFX12-NEXT: s_wait_alu 0xfffe 3196; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 3197; GFX12-NEXT: s_wait_alu 0xfffe 3198; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 3199; GFX12-NEXT: s_cbranch_execnz .LBB12_1 3200; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 3201; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 3202; GFX12-NEXT: s_wait_alu 0xfffe 3203; GFX12-NEXT: s_setpc_b64 s[30:31] 3204; 3205; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3206; GFX940: ; %bb.0: 3207; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 3208; GFX940-NEXT: v_mov_b32_e32 v2, s16 3209; GFX940-NEXT: buffer_wbl2 sc1 3210; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 3211; GFX940-NEXT: s_waitcnt vmcnt(0) 3212; GFX940-NEXT: buffer_inv sc1 3213; GFX940-NEXT: s_setpc_b64 s[30:31] 3214; 3215; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3216; GFX11: ; %bb.0: 3217; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3218; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 3219; GFX11-NEXT: v_mov_b32_e32 v0, s16 3220; GFX11-NEXT: s_add_i32 s4, s16, 0x800 3221; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3222; GFX11-NEXT: v_mov_b32_e32 v6, s4 3223; GFX11-NEXT: s_mov_b32 s4, 0 3224; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 3225; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start 3226; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3227; GFX11-NEXT: s_waitcnt vmcnt(0) 3228; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 3229; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3230; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3231; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3232; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 3233; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 3234; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc 3235; GFX11-NEXT: s_waitcnt vmcnt(0) 3236; GFX11-NEXT: buffer_gl1_inv 3237; GFX11-NEXT: buffer_gl0_inv 3238; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 3239; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 3240; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3241; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 3242; GFX11-NEXT: s_cbranch_execnz .LBB12_1 3243; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 3244; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 3245; GFX11-NEXT: s_setpc_b64 s[30:31] 3246; 3247; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3248; GFX10: ; %bb.0: 3249; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3250; GFX10-NEXT: v_mov_b32_e32 v4, v0 3251; GFX10-NEXT: v_mov_b32_e32 v0, s20 3252; GFX10-NEXT: v_mov_b32_e32 v5, v1 3253; GFX10-NEXT: s_add_i32 s4, s20, 0x800 3254; GFX10-NEXT: v_mov_b32_e32 v6, s4 3255; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3256; GFX10-NEXT: s_mov_b32 s4, 0 3257; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start 3258; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3259; GFX10-NEXT: s_waitcnt vmcnt(0) 3260; GFX10-NEXT: v_mov_b32_e32 v10, v1 3261; GFX10-NEXT: v_mov_b32_e32 v9, v0 3262; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3263; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3264; GFX10-NEXT: v_mov_b32_e32 v0, v7 3265; GFX10-NEXT: v_mov_b32_e32 v1, v8 3266; GFX10-NEXT: v_mov_b32_e32 v2, v9 3267; GFX10-NEXT: v_mov_b32_e32 v3, v10 3268; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3269; GFX10-NEXT: s_waitcnt vmcnt(0) 3270; GFX10-NEXT: buffer_gl1_inv 3271; GFX10-NEXT: buffer_gl0_inv 3272; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 3273; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 3274; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 3275; GFX10-NEXT: s_cbranch_execnz .LBB12_1 3276; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3277; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3278; GFX10-NEXT: s_setpc_b64 s[30:31] 3279; 3280; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3281; GFX90A: ; %bb.0: 3282; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3283; GFX90A-NEXT: v_mov_b32_e32 v2, s20 3284; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 3285; GFX90A-NEXT: s_waitcnt vmcnt(0) 3286; GFX90A-NEXT: buffer_wbinvl1 3287; GFX90A-NEXT: s_setpc_b64 s[30:31] 3288; 3289; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3290; GFX908: ; %bb.0: 3291; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3292; GFX908-NEXT: v_mov_b32_e32 v4, v0 3293; GFX908-NEXT: v_mov_b32_e32 v0, s20 3294; GFX908-NEXT: v_mov_b32_e32 v5, v1 3295; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3296; GFX908-NEXT: s_add_i32 s6, s20, 0x800 3297; GFX908-NEXT: s_mov_b64 s[4:5], 0 3298; GFX908-NEXT: v_mov_b32_e32 v6, s6 3299; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start 3300; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3301; GFX908-NEXT: s_waitcnt vmcnt(0) 3302; GFX908-NEXT: v_mov_b32_e32 v10, v1 3303; GFX908-NEXT: v_mov_b32_e32 v9, v0 3304; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3305; GFX908-NEXT: v_mov_b32_e32 v0, v7 3306; GFX908-NEXT: v_mov_b32_e32 v1, v8 3307; GFX908-NEXT: v_mov_b32_e32 v2, v9 3308; GFX908-NEXT: v_mov_b32_e32 v3, v10 3309; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3310; GFX908-NEXT: s_waitcnt vmcnt(0) 3311; GFX908-NEXT: buffer_wbinvl1 3312; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3313; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3314; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3315; GFX908-NEXT: s_cbranch_execnz .LBB12_1 3316; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3317; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3318; GFX908-NEXT: s_setpc_b64 s[30:31] 3319; 3320; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3321; GFX8: ; %bb.0: 3322; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3323; GFX8-NEXT: v_mov_b32_e32 v4, v0 3324; GFX8-NEXT: v_mov_b32_e32 v0, s20 3325; GFX8-NEXT: v_mov_b32_e32 v5, v1 3326; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3327; GFX8-NEXT: s_add_i32 s6, s20, 0x800 3328; GFX8-NEXT: s_mov_b64 s[4:5], 0 3329; GFX8-NEXT: v_mov_b32_e32 v6, s6 3330; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start 
3331; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3332; GFX8-NEXT: s_waitcnt vmcnt(0) 3333; GFX8-NEXT: v_mov_b32_e32 v10, v1 3334; GFX8-NEXT: v_mov_b32_e32 v9, v0 3335; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3336; GFX8-NEXT: v_mov_b32_e32 v0, v7 3337; GFX8-NEXT: v_mov_b32_e32 v1, v8 3338; GFX8-NEXT: v_mov_b32_e32 v2, v9 3339; GFX8-NEXT: v_mov_b32_e32 v3, v10 3340; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3341; GFX8-NEXT: s_waitcnt vmcnt(0) 3342; GFX8-NEXT: buffer_wbinvl1 3343; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3344; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3345; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3346; GFX8-NEXT: s_cbranch_execnz .LBB12_1 3347; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3348; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3349; GFX8-NEXT: s_setpc_b64 s[30:31] 3350; 3351; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3352; GFX7: ; %bb.0: 3353; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3354; GFX7-NEXT: v_mov_b32_e32 v4, v0 3355; GFX7-NEXT: v_mov_b32_e32 v0, s20 3356; GFX7-NEXT: v_mov_b32_e32 v5, v1 3357; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3358; GFX7-NEXT: s_add_i32 s6, s20, 0x800 3359; GFX7-NEXT: s_mov_b64 s[4:5], 0 3360; GFX7-NEXT: v_mov_b32_e32 v6, s6 3361; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start 3362; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3363; GFX7-NEXT: s_waitcnt vmcnt(0) 3364; GFX7-NEXT: v_mov_b32_e32 v10, v1 3365; GFX7-NEXT: v_mov_b32_e32 v9, v0 3366; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3367; GFX7-NEXT: v_mov_b32_e32 v0, v7 3368; GFX7-NEXT: v_mov_b32_e32 v1, v8 3369; GFX7-NEXT: v_mov_b32_e32 v2, v9 3370; GFX7-NEXT: v_mov_b32_e32 v3, v10 3371; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3372; GFX7-NEXT: s_waitcnt vmcnt(0) 3373; GFX7-NEXT: buffer_wbinvl1 3374; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3375; GFX7-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] 3376; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3377; GFX7-NEXT: s_cbranch_execnz .LBB12_1 3378; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3379; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3380; GFX7-NEXT: s_setpc_b64 s[30:31] 3381; 3382; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 3383; GFX6: ; %bb.0: 3384; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3385; GFX6-NEXT: v_mov_b32_e32 v4, v0 3386; GFX6-NEXT: v_mov_b32_e32 v0, s20 3387; GFX6-NEXT: v_mov_b32_e32 v5, v1 3388; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 3389; GFX6-NEXT: s_add_i32 s6, s20, 0x800 3390; GFX6-NEXT: s_mov_b64 s[4:5], 0 3391; GFX6-NEXT: v_mov_b32_e32 v6, s6 3392; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start 3393; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 3394; GFX6-NEXT: s_waitcnt vmcnt(0) 3395; GFX6-NEXT: v_mov_b32_e32 v10, v1 3396; GFX6-NEXT: v_mov_b32_e32 v9, v0 3397; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] 3398; GFX6-NEXT: s_waitcnt expcnt(0) 3399; GFX6-NEXT: v_mov_b32_e32 v0, v7 3400; GFX6-NEXT: v_mov_b32_e32 v1, v8 3401; GFX6-NEXT: v_mov_b32_e32 v2, v9 3402; GFX6-NEXT: v_mov_b32_e32 v3, v10 3403; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 3404; GFX6-NEXT: s_waitcnt vmcnt(0) 3405; GFX6-NEXT: buffer_wbinvl1 3406; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 3407; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3408; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 3409; GFX6-NEXT: s_cbranch_execnz .LBB12_1 3410; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 3411; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 3412; GFX6-NEXT: s_waitcnt expcnt(0) 3413; GFX6-NEXT: s_setpc_b64 s[30:31] 3414 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 3415 %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 3416 ret double %result 3417} 3418 3419; 
-------------------------------------------------------------------- 3420; half 3421; -------------------------------------------------------------------- 3422 3423define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { 3424; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3425; GFX12: ; %bb.0: 3426; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3427; GFX12-NEXT: s_wait_expcnt 0x0 3428; GFX12-NEXT: s_wait_samplecnt 0x0 3429; GFX12-NEXT: s_wait_bvhcnt 0x0 3430; GFX12-NEXT: s_wait_kmcnt 0x0 3431; GFX12-NEXT: s_addk_co_i32 s16, 0x200 3432; GFX12-NEXT: s_wait_alu 0xfffe 3433; GFX12-NEXT: s_and_b32 s4, s16, -4 3434; GFX12-NEXT: s_wait_alu 0xfffe 3435; GFX12-NEXT: v_mov_b32_e32 v5, s4 3436; GFX12-NEXT: s_and_b32 s4, s16, 3 3437; GFX12-NEXT: s_wait_alu 0xfffe 3438; GFX12-NEXT: s_lshl_b32 s4, s4, 3 3439; GFX12-NEXT: s_wait_alu 0xfffe 3440; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 3441; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen 3442; GFX12-NEXT: s_wait_alu 0xfffe 3443; GFX12-NEXT: s_not_b32 s6, s5 3444; GFX12-NEXT: s_mov_b32 s5, 0 3445; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start 3446; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3447; GFX12-NEXT: s_wait_loadcnt 0x0 3448; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 3449; GFX12-NEXT: s_wait_storecnt 0x0 3450; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3451; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 3452; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 3453; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3454; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 3455; GFX12-NEXT: s_wait_alu 0xfffe 3456; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 3457; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3458; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 3459; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN 
3460; GFX12-NEXT: s_wait_loadcnt 0x0 3461; GFX12-NEXT: global_inv scope:SCOPE_DEV 3462; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 3463; GFX12-NEXT: v_mov_b32_e32 v2, v3 3464; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 3465; GFX12-NEXT: s_wait_alu 0xfffe 3466; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 3467; GFX12-NEXT: s_cbranch_execnz .LBB13_1 3468; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 3469; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 3470; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 3471; GFX12-NEXT: s_wait_alu 0xfffe 3472; GFX12-NEXT: s_setpc_b64 s[30:31] 3473; 3474; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3475; GFX940: ; %bb.0: 3476; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3477; GFX940-NEXT: s_addk_i32 s16, 0x200 3478; GFX940-NEXT: s_and_b32 s4, s16, -4 3479; GFX940-NEXT: v_mov_b32_e32 v1, s4 3480; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen 3481; GFX940-NEXT: s_and_b32 s4, s16, 3 3482; GFX940-NEXT: s_lshl_b32 s6, s4, 3 3483; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 3484; GFX940-NEXT: s_not_b32 s7, s4 3485; GFX940-NEXT: s_mov_b64 s[4:5], 0 3486; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start 3487; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 3488; GFX940-NEXT: s_waitcnt vmcnt(0) 3489; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 3490; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 3491; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 3492; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 3493; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] 3494; GFX940-NEXT: buffer_wbl2 sc1 3495; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 3496; GFX940-NEXT: s_waitcnt vmcnt(0) 3497; GFX940-NEXT: buffer_inv sc1 3498; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 3499; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3500; GFX940-NEXT: v_mov_b32_e32 v3, v4 3501; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 3502; GFX940-NEXT: s_cbranch_execnz .LBB13_1 3503; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 3504; GFX940-NEXT: s_or_b64 exec, 
exec, s[4:5] 3505; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 3506; GFX940-NEXT: s_setpc_b64 s[30:31] 3507; 3508; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3509; GFX11: ; %bb.0: 3510; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3511; GFX11-NEXT: s_addk_i32 s16, 0x200 3512; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3513; GFX11-NEXT: s_and_b32 s4, s16, -4 3514; GFX11-NEXT: v_mov_b32_e32 v5, s4 3515; GFX11-NEXT: s_and_b32 s4, s16, 3 3516; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3517; GFX11-NEXT: s_lshl_b32 s4, s4, 3 3518; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 3519; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen 3520; GFX11-NEXT: s_not_b32 s6, s5 3521; GFX11-NEXT: s_mov_b32 s5, 0 3522; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start 3523; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3524; GFX11-NEXT: s_waitcnt vmcnt(0) 3525; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 3526; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3527; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3528; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 3529; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 3530; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3531; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 3532; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 3533; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3534; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 3535; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc 3536; GFX11-NEXT: s_waitcnt vmcnt(0) 3537; GFX11-NEXT: buffer_gl1_inv 3538; GFX11-NEXT: buffer_gl0_inv 3539; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 3540; GFX11-NEXT: v_mov_b32_e32 v2, v3 3541; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 3542; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3543; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 3544; GFX11-NEXT: s_cbranch_execnz 
.LBB13_1 3545; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 3546; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 3547; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 3548; GFX11-NEXT: s_setpc_b64 s[30:31] 3549; 3550; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3551; GFX10: ; %bb.0: 3552; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3553; GFX10-NEXT: s_addk_i32 s20, 0x200 3554; GFX10-NEXT: s_and_b32 s4, s20, -4 3555; GFX10-NEXT: v_mov_b32_e32 v5, s4 3556; GFX10-NEXT: s_and_b32 s4, s20, 3 3557; GFX10-NEXT: s_lshl_b32 s4, s4, 3 3558; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 3559; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen 3560; GFX10-NEXT: s_not_b32 s6, s5 3561; GFX10-NEXT: s_mov_b32 s5, 0 3562; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start 3563; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3564; GFX10-NEXT: s_waitcnt vmcnt(0) 3565; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 3566; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3567; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 3568; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3569; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 3570; GFX10-NEXT: v_mov_b32_e32 v4, v2 3571; GFX10-NEXT: v_mov_b32_e32 v3, v1 3572; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc 3573; GFX10-NEXT: s_waitcnt vmcnt(0) 3574; GFX10-NEXT: buffer_gl1_inv 3575; GFX10-NEXT: buffer_gl0_inv 3576; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 3577; GFX10-NEXT: v_mov_b32_e32 v2, v3 3578; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 3579; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 3580; GFX10-NEXT: s_cbranch_execnz .LBB13_1 3581; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3582; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 3583; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 3584; GFX10-NEXT: s_setpc_b64 s[30:31] 3585; 3586; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3587; GFX90A: ; %bb.0: 3588; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 3589; GFX90A-NEXT: s_addk_i32 s20, 0x200 3590; GFX90A-NEXT: s_and_b32 s4, s20, -4 3591; GFX90A-NEXT: v_mov_b32_e32 v1, s4 3592; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen 3593; GFX90A-NEXT: s_and_b32 s4, s20, 3 3594; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 3595; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 3596; GFX90A-NEXT: s_not_b32 s7, s4 3597; GFX90A-NEXT: s_mov_b64 s[4:5], 0 3598; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start 3599; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3600; GFX90A-NEXT: s_waitcnt vmcnt(0) 3601; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 3602; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 3603; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 3604; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 3605; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] 3606; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc 3607; GFX90A-NEXT: s_waitcnt vmcnt(0) 3608; GFX90A-NEXT: buffer_wbinvl1 3609; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 3610; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3611; GFX90A-NEXT: v_mov_b32_e32 v3, v4 3612; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 3613; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 3614; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 3615; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3616; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 3617; GFX90A-NEXT: s_setpc_b64 s[30:31] 3618; 3619; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3620; GFX908: ; %bb.0: 3621; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3622; GFX908-NEXT: s_addk_i32 s20, 0x200 3623; GFX908-NEXT: s_and_b32 s4, s20, -4 3624; GFX908-NEXT: v_mov_b32_e32 v5, s4 3625; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen 3626; GFX908-NEXT: s_and_b32 s4, s20, 3 3627; GFX908-NEXT: s_lshl_b32 s6, s4, 3 3628; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 3629; GFX908-NEXT: s_not_b32 s7, s4 3630; GFX908-NEXT: s_mov_b64 s[4:5], 0 3631; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start 3632; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3633; GFX908-NEXT: s_waitcnt vmcnt(0) 3634; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 3635; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 3636; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 3637; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 3638; GFX908-NEXT: v_mov_b32_e32 v4, v2 3639; GFX908-NEXT: v_mov_b32_e32 v3, v1 3640; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc 3641; GFX908-NEXT: s_waitcnt vmcnt(0) 3642; GFX908-NEXT: buffer_wbinvl1 3643; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 3644; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3645; GFX908-NEXT: v_mov_b32_e32 v2, v3 3646; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3647; GFX908-NEXT: s_cbranch_execnz .LBB13_1 3648; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3649; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3650; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 3651; GFX908-NEXT: s_setpc_b64 s[30:31] 3652; 3653; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3654; GFX8: ; %bb.0: 3655; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3656; GFX8-NEXT: s_addk_i32 s20, 0x200 3657; GFX8-NEXT: s_and_b32 s4, s20, -4 3658; GFX8-NEXT: v_mov_b32_e32 v5, s4 3659; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen 3660; GFX8-NEXT: s_and_b32 s4, s20, 3 3661; GFX8-NEXT: s_lshl_b32 s6, s4, 3 3662; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 3663; GFX8-NEXT: s_not_b32 s7, s4 3664; GFX8-NEXT: s_mov_b64 s[4:5], 0 3665; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start 3666; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3667; GFX8-NEXT: s_waitcnt vmcnt(0) 3668; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 3669; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 3670; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 3671; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 3672; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 3673; GFX8-NEXT: v_mov_b32_e32 v4, v2 3674; GFX8-NEXT: v_mov_b32_e32 v3, v1 3675; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc 3676; GFX8-NEXT: s_waitcnt vmcnt(0) 
3677; GFX8-NEXT: buffer_wbinvl1 3678; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 3679; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3680; GFX8-NEXT: v_mov_b32_e32 v2, v3 3681; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3682; GFX8-NEXT: s_cbranch_execnz .LBB13_1 3683; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3684; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3685; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 3686; GFX8-NEXT: s_setpc_b64 s[30:31] 3687; 3688; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3689; GFX7: ; %bb.0: 3690; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3691; GFX7-NEXT: s_addk_i32 s20, 0x200 3692; GFX7-NEXT: s_and_b32 s4, s20, -4 3693; GFX7-NEXT: v_mov_b32_e32 v4, s4 3694; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 3695; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 3696; GFX7-NEXT: s_and_b32 s4, s20, 3 3697; GFX7-NEXT: s_lshl_b32 s6, s4, 3 3698; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 3699; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 3700; GFX7-NEXT: s_not_b32 s7, s4 3701; GFX7-NEXT: s_mov_b64 s[4:5], 0 3702; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start 3703; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3704; GFX7-NEXT: s_waitcnt vmcnt(0) 3705; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3706; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 3707; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 3708; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 3709; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 3710; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3711; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 3712; GFX7-NEXT: v_mov_b32_e32 v3, v1 3713; GFX7-NEXT: v_mov_b32_e32 v2, v0 3714; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 3715; GFX7-NEXT: s_waitcnt vmcnt(0) 3716; GFX7-NEXT: buffer_wbinvl1 3717; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 3718; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3719; GFX7-NEXT: v_mov_b32_e32 v1, v2 3720; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3721; GFX7-NEXT: s_cbranch_execnz .LBB13_1 3722; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3723; GFX7-NEXT: 
s_or_b64 exec, exec, s[4:5] 3724; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 3725; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 3726; GFX7-NEXT: s_setpc_b64 s[30:31] 3727; 3728; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: 3729; GFX6: ; %bb.0: 3730; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3731; GFX6-NEXT: s_addk_i32 s20, 0x200 3732; GFX6-NEXT: s_and_b32 s4, s20, -4 3733; GFX6-NEXT: v_mov_b32_e32 v4, s4 3734; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 3735; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 3736; GFX6-NEXT: s_and_b32 s4, s20, 3 3737; GFX6-NEXT: s_lshl_b32 s6, s4, 3 3738; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 3739; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 3740; GFX6-NEXT: s_not_b32 s7, s4 3741; GFX6-NEXT: s_mov_b64 s[4:5], 0 3742; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start 3743; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 3744; GFX6-NEXT: s_waitcnt vmcnt(0) 3745; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3746; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 3747; GFX6-NEXT: s_waitcnt expcnt(0) 3748; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 3749; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 3750; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 3751; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3752; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 3753; GFX6-NEXT: v_mov_b32_e32 v3, v1 3754; GFX6-NEXT: v_mov_b32_e32 v2, v0 3755; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 3756; GFX6-NEXT: s_waitcnt vmcnt(0) 3757; GFX6-NEXT: buffer_wbinvl1 3758; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 3759; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3760; GFX6-NEXT: v_mov_b32_e32 v1, v2 3761; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 3762; GFX6-NEXT: s_cbranch_execnz .LBB13_1 3763; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 3764; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 3765; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 3766; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 3767; GFX6-NEXT: s_waitcnt expcnt(0) 3768; GFX6-NEXT: s_setpc_b64 s[30:31] 3769 %gep = getelementptr half, ptr addrspace(7) 
%ptr, i32 256 3770 %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3771 ret half %result 3772} 3773
; No-return variant of the f16 buffer-fat-pointer fadd test: performs an agent-scope,
; seq_cst `atomicrmw fadd` of a half at element index 256 (the checks add 0x200 bytes to the
; offset register) through an `inreg` ptr addrspace(7), discards the result and returns void.
; In the autogenerated checks below every RUN configuration expands the operation into a
; compare-and-swap loop on the containing dword: the address is masked with -4, the half is
; selected with a shift of (offset & 3) * 8 and a 0xffff mask, and buffer_atomic_cmpswap is
; retried until the returned dword equals the expected one. The tahiti and hawaii runs do the
; add in f32 (v_cvt_f32_f16 / v_add_f32 / v_cvt_f16_f32); the remaining runs use v_add_f16.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate, do not hand-edit.
3774define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { 3775; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3776; GFX12: ; %bb.0: 3777; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3778; GFX12-NEXT: s_wait_expcnt 0x0 3779; GFX12-NEXT: s_wait_samplecnt 0x0 3780; GFX12-NEXT: s_wait_bvhcnt 0x0 3781; GFX12-NEXT: s_wait_kmcnt 0x0 3782; GFX12-NEXT: s_addk_co_i32 s16, 0x200 3783; GFX12-NEXT: s_wait_alu 0xfffe 3784; GFX12-NEXT: s_and_b32 s4, s16, -4 3785; GFX12-NEXT: s_wait_alu 0xfffe 3786; GFX12-NEXT: v_mov_b32_e32 v3, s4 3787; GFX12-NEXT: s_and_b32 s4, s16, 3 3788; GFX12-NEXT: s_wait_alu 0xfffe 3789; GFX12-NEXT: s_lshl_b32 s4, s4, 3 3790; GFX12-NEXT: s_wait_alu 0xfffe 3791; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 3792; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen 3793; GFX12-NEXT: s_wait_alu 0xfffe 3794; GFX12-NEXT: s_not_b32 s6, s5 3795; GFX12-NEXT: s_mov_b32 s5, 0 3796; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start 3797; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3798; GFX12-NEXT: s_wait_loadcnt 0x0 3799; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 3800; GFX12-NEXT: s_wait_storecnt 0x0 3801; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3802; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 3803; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 3804; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3805; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 3806; GFX12-NEXT: s_wait_alu 0xfffe 3807; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 3808; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3809; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 3810; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen
th:TH_ATOMIC_RETURN 3811; GFX12-NEXT: s_wait_loadcnt 0x0 3812; GFX12-NEXT: global_inv scope:SCOPE_DEV 3813; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 3814; GFX12-NEXT: v_mov_b32_e32 v2, v4 3815; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 3816; GFX12-NEXT: s_wait_alu 0xfffe 3817; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 3818; GFX12-NEXT: s_cbranch_execnz .LBB14_1 3819; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 3820; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 3821; GFX12-NEXT: s_wait_alu 0xfffe 3822; GFX12-NEXT: s_setpc_b64 s[30:31] 3823; 3824; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3825; GFX940: ; %bb.0: 3826; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3827; GFX940-NEXT: s_addk_i32 s16, 0x200 3828; GFX940-NEXT: s_and_b32 s4, s16, -4 3829; GFX940-NEXT: v_mov_b32_e32 v1, s4 3830; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen 3831; GFX940-NEXT: s_and_b32 s4, s16, 3 3832; GFX940-NEXT: s_lshl_b32 s6, s4, 3 3833; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 3834; GFX940-NEXT: s_not_b32 s7, s4 3835; GFX940-NEXT: s_mov_b64 s[4:5], 0 3836; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start 3837; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 3838; GFX940-NEXT: s_waitcnt vmcnt(0) 3839; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 3840; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 3841; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 3842; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 3843; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] 3844; GFX940-NEXT: buffer_wbl2 sc1 3845; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 3846; GFX940-NEXT: s_waitcnt vmcnt(0) 3847; GFX940-NEXT: buffer_inv sc1 3848; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 3849; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3850; GFX940-NEXT: v_mov_b32_e32 v3, v4 3851; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 3852; GFX940-NEXT: s_cbranch_execnz .LBB14_1 3853; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 3854; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 3855;
GFX940-NEXT: s_setpc_b64 s[30:31] 3856; 3857; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3858; GFX11: ; %bb.0: 3859; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3860; GFX11-NEXT: s_addk_i32 s16, 0x200 3861; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3862; GFX11-NEXT: s_and_b32 s4, s16, -4 3863; GFX11-NEXT: v_mov_b32_e32 v3, s4 3864; GFX11-NEXT: s_and_b32 s4, s16, 3 3865; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3866; GFX11-NEXT: s_lshl_b32 s4, s4, 3 3867; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 3868; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen 3869; GFX11-NEXT: s_not_b32 s6, s5 3870; GFX11-NEXT: s_mov_b32 s5, 0 3871; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start 3872; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3873; GFX11-NEXT: s_waitcnt vmcnt(0) 3874; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 3875; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3876; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3877; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 3878; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 3879; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3880; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 3881; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 3882; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3883; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 3884; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc 3885; GFX11-NEXT: s_waitcnt vmcnt(0) 3886; GFX11-NEXT: buffer_gl1_inv 3887; GFX11-NEXT: buffer_gl0_inv 3888; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 3889; GFX11-NEXT: v_mov_b32_e32 v2, v4 3890; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 3891; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3892; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 3893; GFX11-NEXT: s_cbranch_execnz .LBB14_1 3894; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 3895;
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 3896; GFX11-NEXT: s_setpc_b64 s[30:31] 3897; 3898; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3899; GFX10: ; %bb.0: 3900; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3901; GFX10-NEXT: s_addk_i32 s20, 0x200 3902; GFX10-NEXT: s_and_b32 s4, s20, -4 3903; GFX10-NEXT: v_mov_b32_e32 v3, s4 3904; GFX10-NEXT: s_and_b32 s4, s20, 3 3905; GFX10-NEXT: s_lshl_b32 s4, s4, 3 3906; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 3907; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen 3908; GFX10-NEXT: s_not_b32 s6, s5 3909; GFX10-NEXT: s_mov_b32 s5, 0 3910; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start 3911; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3912; GFX10-NEXT: s_waitcnt vmcnt(0) 3913; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 3914; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3915; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 3916; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3917; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 3918; GFX10-NEXT: v_mov_b32_e32 v5, v2 3919; GFX10-NEXT: v_mov_b32_e32 v4, v1 3920; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 3921; GFX10-NEXT: s_waitcnt vmcnt(0) 3922; GFX10-NEXT: buffer_gl1_inv 3923; GFX10-NEXT: buffer_gl0_inv 3924; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 3925; GFX10-NEXT: v_mov_b32_e32 v2, v4 3926; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 3927; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 3928; GFX10-NEXT: s_cbranch_execnz .LBB14_1 3929; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3930; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 3931; GFX10-NEXT: s_setpc_b64 s[30:31] 3932; 3933; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3934; GFX90A: ; %bb.0: 3935; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3936; GFX90A-NEXT: s_addk_i32 s20, 0x200 3937; GFX90A-NEXT: s_and_b32 s4, s20, -4 3938; GFX90A-NEXT: v_mov_b32_e32 v1, s4
3939; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen 3940; GFX90A-NEXT: s_and_b32 s4, s20, 3 3941; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 3942; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 3943; GFX90A-NEXT: s_not_b32 s7, s4 3944; GFX90A-NEXT: s_mov_b64 s[4:5], 0 3945; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start 3946; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3947; GFX90A-NEXT: s_waitcnt vmcnt(0) 3948; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 3949; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 3950; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 3951; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 3952; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] 3953; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc 3954; GFX90A-NEXT: s_waitcnt vmcnt(0) 3955; GFX90A-NEXT: buffer_wbinvl1 3956; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 3957; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3958; GFX90A-NEXT: v_mov_b32_e32 v3, v4 3959; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 3960; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 3961; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 3962; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3963; GFX90A-NEXT: s_setpc_b64 s[30:31] 3964; 3965; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3966; GFX908: ; %bb.0: 3967; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3968; GFX908-NEXT: s_addk_i32 s20, 0x200 3969; GFX908-NEXT: s_and_b32 s4, s20, -4 3970; GFX908-NEXT: v_mov_b32_e32 v3, s4 3971; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen 3972; GFX908-NEXT: s_and_b32 s4, s20, 3 3973; GFX908-NEXT: s_lshl_b32 s6, s4, 3 3974; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 3975; GFX908-NEXT: s_not_b32 s7, s4 3976; GFX908-NEXT: s_mov_b64 s[4:5], 0 3977; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start 3978; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3979; GFX908-NEXT: s_waitcnt vmcnt(0) 3980; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 3981; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 3982; GFX908-NEXT:
v_lshlrev_b32_e32 v1, s6, v1 3983; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 3984; GFX908-NEXT: v_mov_b32_e32 v5, v2 3985; GFX908-NEXT: v_mov_b32_e32 v4, v1 3986; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 3987; GFX908-NEXT: s_waitcnt vmcnt(0) 3988; GFX908-NEXT: buffer_wbinvl1 3989; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 3990; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3991; GFX908-NEXT: v_mov_b32_e32 v2, v4 3992; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3993; GFX908-NEXT: s_cbranch_execnz .LBB14_1 3994; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3995; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3996; GFX908-NEXT: s_setpc_b64 s[30:31] 3997; 3998; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 3999; GFX8: ; %bb.0: 4000; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4001; GFX8-NEXT: s_addk_i32 s20, 0x200 4002; GFX8-NEXT: s_and_b32 s4, s20, -4 4003; GFX8-NEXT: v_mov_b32_e32 v3, s4 4004; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen 4005; GFX8-NEXT: s_and_b32 s4, s20, 3 4006; GFX8-NEXT: s_lshl_b32 s6, s4, 3 4007; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 4008; GFX8-NEXT: s_not_b32 s7, s4 4009; GFX8-NEXT: s_mov_b64 s[4:5], 0 4010; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start 4011; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4012; GFX8-NEXT: s_waitcnt vmcnt(0) 4013; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 4014; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 4015; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 4016; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 4017; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 4018; GFX8-NEXT: v_mov_b32_e32 v5, v2 4019; GFX8-NEXT: v_mov_b32_e32 v4, v1 4020; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 4021; GFX8-NEXT: s_waitcnt vmcnt(0) 4022; GFX8-NEXT: buffer_wbinvl1 4023; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 4024; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4025; GFX8-NEXT: v_mov_b32_e32 v2, v4 4026; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 4027; GFX8-NEXT: s_cbranch_execnz
.LBB14_1 4028; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 4029; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4030; GFX8-NEXT: s_setpc_b64 s[30:31] 4031; 4032; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 4033; GFX7: ; %bb.0: 4034; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4035; GFX7-NEXT: s_addk_i32 s20, 0x200 4036; GFX7-NEXT: s_and_b32 s4, s20, -4 4037; GFX7-NEXT: v_mov_b32_e32 v2, s4 4038; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 4039; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 4040; GFX7-NEXT: s_and_b32 s4, s20, 3 4041; GFX7-NEXT: s_lshl_b32 s6, s4, 3 4042; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 4043; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 4044; GFX7-NEXT: s_not_b32 s7, s4 4045; GFX7-NEXT: s_mov_b64 s[4:5], 0 4046; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start 4047; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4048; GFX7-NEXT: s_waitcnt vmcnt(0) 4049; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 4050; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 4051; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 4052; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 4053; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 4054; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 4055; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 4056; GFX7-NEXT: v_mov_b32_e32 v5, v1 4057; GFX7-NEXT: v_mov_b32_e32 v4, v0 4058; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 4059; GFX7-NEXT: s_waitcnt vmcnt(0) 4060; GFX7-NEXT: buffer_wbinvl1 4061; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 4062; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4063; GFX7-NEXT: v_mov_b32_e32 v1, v4 4064; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 4065; GFX7-NEXT: s_cbranch_execnz .LBB14_1 4066; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 4067; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4068; GFX7-NEXT: s_setpc_b64 s[30:31] 4069; 4070; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: 4071; GFX6: ; %bb.0: 4072; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4073; GFX6-NEXT: s_addk_i32 s20, 0x200
4074; GFX6-NEXT: s_and_b32 s4, s20, -4 4075; GFX6-NEXT: v_mov_b32_e32 v2, s4 4076; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 4077; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 4078; GFX6-NEXT: s_and_b32 s4, s20, 3 4079; GFX6-NEXT: s_lshl_b32 s6, s4, 3 4080; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 4081; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 4082; GFX6-NEXT: s_not_b32 s7, s4 4083; GFX6-NEXT: s_mov_b64 s[4:5], 0 4084; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start 4085; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 4086; GFX6-NEXT: s_waitcnt vmcnt(0) 4087; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 4088; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 4089; GFX6-NEXT: s_waitcnt expcnt(0) 4090; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 4091; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 4092; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 4093; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 4094; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 4095; GFX6-NEXT: v_mov_b32_e32 v5, v1 4096; GFX6-NEXT: v_mov_b32_e32 v4, v0 4097; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 4098; GFX6-NEXT: s_waitcnt vmcnt(0) 4099; GFX6-NEXT: buffer_wbinvl1 4100; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 4101; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4102; GFX6-NEXT: v_mov_b32_e32 v1, v4 4103; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 4104; GFX6-NEXT: s_cbranch_execnz .LBB14_1 4105; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 4106; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 4107; GFX6-NEXT: s_waitcnt expcnt(0) 4108; GFX6-NEXT: s_setpc_b64 s[30:31] 4109 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 4110 %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 4111 ret void 4112} 4113 4114define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { 4115; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4116; GFX12: ; %bb.0: 4117; GFX12-NEXT:
s_wait_loadcnt_dscnt 0x0 4118; GFX12-NEXT: s_wait_expcnt 0x0 4119; GFX12-NEXT: s_wait_samplecnt 0x0 4120; GFX12-NEXT: s_wait_bvhcnt 0x0 4121; GFX12-NEXT: s_wait_kmcnt 0x0 4122; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 4123; GFX12-NEXT: s_mov_b32 s1, exec_lo 4124; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4125; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 4126; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 4127; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4128; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4129; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff 4130; GFX12-NEXT: v_not_b32_e32 v11, v7 4131; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4132; GFX12-NEXT: v_readfirstlane_b32 s4, v0 4133; GFX12-NEXT: v_readfirstlane_b32 s5, v1 4134; GFX12-NEXT: v_readfirstlane_b32 s6, v2 4135; GFX12-NEXT: v_readfirstlane_b32 s7, v3 4136; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4137; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4138; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4139; GFX12-NEXT: s_wait_alu 0xfffe 4140; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4141; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 4142; GFX12-NEXT: s_wait_alu 0xfffe 4143; GFX12-NEXT: s_and_saveexec_b32 s0, s0 4144; GFX12-NEXT: s_wait_loadcnt 0x0 4145; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen 4146; GFX12-NEXT: s_wait_alu 0xfffe 4147; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4148; GFX12-NEXT: s_cbranch_execnz .LBB15_1 4149; GFX12-NEXT: ; %bb.2: 4150; GFX12-NEXT: s_mov_b32 exec_lo, s1 4151; GFX12-NEXT: s_mov_b32 s1, 0 4152; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start 4153; GFX12-NEXT: ; =>This Loop Header: Depth=1 4154; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 4155; GFX12-NEXT: s_wait_loadcnt 0x0 4156; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4157; GFX12-NEXT: s_mov_b32 s2, exec_lo 4158; GFX12-NEXT: s_wait_storecnt 0x0 4159; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) 4160; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 4161; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 4162; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4163; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 4164; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 4165; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4166; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 4167; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4168; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 4169; GFX12-NEXT: v_readfirstlane_b32 s4, v0 4170; GFX12-NEXT: v_readfirstlane_b32 s5, v1 4171; GFX12-NEXT: v_readfirstlane_b32 s6, v2 4172; GFX12-NEXT: v_readfirstlane_b32 s7, v3 4173; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4174; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4175; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4176; GFX12-NEXT: s_wait_alu 0xfffe 4177; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4178; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 4179; GFX12-NEXT: s_wait_alu 0xfffe 4180; GFX12-NEXT: s_and_saveexec_b32 s0, s0 4181; GFX12-NEXT: s_wait_loadcnt 0x0 4182; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN 4183; GFX12-NEXT: s_wait_alu 0xfffe 4184; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4185; GFX12-NEXT: s_cbranch_execnz .LBB15_4 4186; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4187; GFX12-NEXT: s_mov_b32 exec_lo, s2 4188; GFX12-NEXT: s_wait_loadcnt 0x0 4189; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 4190; GFX12-NEXT: v_mov_b32_e32 v7, v8 4191; GFX12-NEXT: global_inv scope:SCOPE_DEV 4192; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4193; GFX12-NEXT: s_wait_alu 0xfffe 4194; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4195; GFX12-NEXT: s_cbranch_execnz .LBB15_3 4196; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 4197; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 4198; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4199; GFX12-NEXT: 
s_wait_alu 0xfffe 4200; GFX12-NEXT: s_setpc_b64 s[30:31] 4201; 4202; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4203; GFX940: ; %bb.0: 4204; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4205; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 4206; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 4207; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 4208; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4209; GFX940-NEXT: s_mov_b32 s0, 0xffff 4210; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 4211; GFX940-NEXT: v_not_b32_e32 v11, v6 4212; GFX940-NEXT: s_mov_b64 s[2:3], exec 4213; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4214; GFX940-NEXT: v_readfirstlane_b32 s4, v0 4215; GFX940-NEXT: v_readfirstlane_b32 s5, v1 4216; GFX940-NEXT: v_readfirstlane_b32 s6, v2 4217; GFX940-NEXT: v_readfirstlane_b32 s7, v3 4218; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 4219; GFX940-NEXT: s_nop 0 4220; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 4221; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 4222; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 4223; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen 4224; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 4225; GFX940-NEXT: s_cbranch_execnz .LBB15_1 4226; GFX940-NEXT: ; %bb.2: 4227; GFX940-NEXT: s_mov_b64 exec, s[2:3] 4228; GFX940-NEXT: s_mov_b64 s[2:3], 0 4229; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start 4230; GFX940-NEXT: ; =>This Loop Header: Depth=1 4231; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 4232; GFX940-NEXT: s_waitcnt vmcnt(0) 4233; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4234; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 4235; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 4236; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 4237; GFX940-NEXT: s_mov_b64 s[8:9], exec 4238; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] 4239; GFX940-NEXT: buffer_wbl2 sc1 4240; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4241; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 4242; GFX940-NEXT: 
v_readfirstlane_b32 s4, v0 4243; GFX940-NEXT: v_readfirstlane_b32 s5, v1 4244; GFX940-NEXT: v_readfirstlane_b32 s6, v2 4245; GFX940-NEXT: v_readfirstlane_b32 s7, v3 4246; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 4247; GFX940-NEXT: s_nop 0 4248; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 4249; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 4250; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 4251; GFX940-NEXT: s_waitcnt vmcnt(0) 4252; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 4253; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 4254; GFX940-NEXT: s_cbranch_execnz .LBB15_4 4255; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4256; GFX940-NEXT: s_mov_b64 exec, s[8:9] 4257; GFX940-NEXT: s_waitcnt vmcnt(0) 4258; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 4259; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 4260; GFX940-NEXT: v_mov_b32_e32 v7, v8 4261; GFX940-NEXT: buffer_inv sc1 4262; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 4263; GFX940-NEXT: s_cbranch_execnz .LBB15_3 4264; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 4265; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 4266; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4267; GFX940-NEXT: s_setpc_b64 s[30:31] 4268; 4269; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4270; GFX11: ; %bb.0: 4271; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4272; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 4273; GFX11-NEXT: s_mov_b32 s1, 0 4274; GFX11-NEXT: s_mov_b32 s2, exec_lo 4275; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4276; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 4277; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 4278; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4279; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4280; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff 4281; GFX11-NEXT: v_not_b32_e32 v11, v7 4282; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4283; 
GFX11-NEXT: v_readfirstlane_b32 s4, v0 4284; GFX11-NEXT: v_readfirstlane_b32 s5, v1 4285; GFX11-NEXT: v_readfirstlane_b32 s6, v2 4286; GFX11-NEXT: v_readfirstlane_b32 s7, v3 4287; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4288; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4289; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4290; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 4291; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 4292; GFX11-NEXT: s_and_saveexec_b32 s0, s0 4293; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen 4294; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4295; GFX11-NEXT: s_cbranch_execnz .LBB15_1 4296; GFX11-NEXT: ; %bb.2: 4297; GFX11-NEXT: s_mov_b32 exec_lo, s2 4298; GFX11-NEXT: .p2align 6 4299; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start 4300; GFX11-NEXT: ; =>This Loop Header: Depth=1 4301; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 4302; GFX11-NEXT: s_waitcnt vmcnt(0) 4303; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4304; GFX11-NEXT: s_mov_b32 s2, exec_lo 4305; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4306; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4307; GFX11-NEXT: v_add_f16_e32 v6, v6, v5 4308; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 4309; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4310; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 4311; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 4312; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4313; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 4314; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4315; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 4316; GFX11-NEXT: v_readfirstlane_b32 s4, v0 4317; GFX11-NEXT: v_readfirstlane_b32 s5, v1 4318; GFX11-NEXT: v_readfirstlane_b32 s6, v2 4319; GFX11-NEXT: v_readfirstlane_b32 s7, v3 4320; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4321; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4322; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4323; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 4324; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 4325; GFX11-NEXT: s_and_saveexec_b32 s0, s0 4326; GFX11-NEXT: s_waitcnt vmcnt(0) 4327; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc 4328; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4329; GFX11-NEXT: s_cbranch_execnz .LBB15_4 4330; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4331; GFX11-NEXT: s_mov_b32 exec_lo, s2 4332; GFX11-NEXT: s_waitcnt vmcnt(0) 4333; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 4334; GFX11-NEXT: v_mov_b32_e32 v7, v8 4335; GFX11-NEXT: buffer_gl1_inv 4336; GFX11-NEXT: buffer_gl0_inv 4337; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 4338; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4339; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4340; GFX11-NEXT: s_cbranch_execnz .LBB15_3 4341; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 4342; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 4343; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4344; GFX11-NEXT: s_setpc_b64 s[30:31] 4345; 4346; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4347; GFX10: ; %bb.0: 4348; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4349; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 4350; GFX10-NEXT: s_mov_b32 s5, 0 4351; GFX10-NEXT: s_mov_b32 s6, exec_lo 4352; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 4353; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 4354; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4355; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff 4356; GFX10-NEXT: v_not_b32_e32 v11, v7 4357; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4358; GFX10-NEXT: v_readfirstlane_b32 s8, v0 4359; GFX10-NEXT: v_readfirstlane_b32 s9, v1 4360; GFX10-NEXT: v_readfirstlane_b32 s10, v2 4361; GFX10-NEXT: v_readfirstlane_b32 s11, v3 4362; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 4363; 
GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 4364; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 4365; GFX10-NEXT: s_and_saveexec_b32 s4, s4 4366; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen 4367; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4368; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 4369; GFX10-NEXT: s_cbranch_execnz .LBB15_1 4370; GFX10-NEXT: ; %bb.2: 4371; GFX10-NEXT: s_mov_b32 exec_lo, s6 4372; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start 4373; GFX10-NEXT: ; =>This Loop Header: Depth=1 4374; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 4375; GFX10-NEXT: s_waitcnt vmcnt(0) 4376; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4377; GFX10-NEXT: s_mov_b32 s6, exec_lo 4378; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4379; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 4380; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 4381; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 4382; GFX10-NEXT: v_mov_b32_e32 v9, v7 4383; GFX10-NEXT: v_mov_b32_e32 v8, v6 4384; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4385; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 4386; GFX10-NEXT: v_readfirstlane_b32 s8, v0 4387; GFX10-NEXT: v_readfirstlane_b32 s9, v1 4388; GFX10-NEXT: v_readfirstlane_b32 s10, v2 4389; GFX10-NEXT: v_readfirstlane_b32 s11, v3 4390; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 4391; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 4392; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 4393; GFX10-NEXT: s_and_saveexec_b32 s4, s4 4394; GFX10-NEXT: s_waitcnt vmcnt(0) 4395; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc 4396; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4397; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 4398; GFX10-NEXT: s_cbranch_execnz .LBB15_4 4399; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4400; GFX10-NEXT: s_mov_b32 exec_lo, s6 4401; GFX10-NEXT: s_waitcnt vmcnt(0) 4402; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 4403; GFX10-NEXT: v_mov_b32_e32 v7, v8 4404; GFX10-NEXT: buffer_gl1_inv 4405; 
GFX10-NEXT: buffer_gl0_inv 4406; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 4407; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4408; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 4409; GFX10-NEXT: s_cbranch_execnz .LBB15_3 4410; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 4411; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 4412; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4413; GFX10-NEXT: s_setpc_b64 s[30:31] 4414; 4415; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4416; GFX90A: ; %bb.0: 4417; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4418; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 4419; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 4420; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 4421; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4422; GFX90A-NEXT: s_mov_b32 s4, 0xffff 4423; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 4424; GFX90A-NEXT: v_not_b32_e32 v11, v6 4425; GFX90A-NEXT: s_mov_b64 s[6:7], exec 4426; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4427; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 4428; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 4429; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 4430; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 4431; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4432; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4433; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4434; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4435; GFX90A-NEXT: s_nop 0 4436; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen 4437; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 4438; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 4439; GFX90A-NEXT: ; %bb.2: 4440; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 4441; GFX90A-NEXT: s_mov_b64 s[6:7], 0 4442; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start 4443; GFX90A-NEXT: ; =>This Loop Header: Depth=1 4444; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 4445; GFX90A-NEXT: s_waitcnt vmcnt(0) 4446; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4447; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 4448; GFX90A-NEXT: 
v_lshlrev_b32_e32 v6, v4, v6 4449; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 4450; GFX90A-NEXT: s_mov_b64 s[12:13], exec 4451; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] 4452; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4453; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 4454; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 4455; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 4456; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 4457; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 4458; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4459; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4460; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4461; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4462; GFX90A-NEXT: s_waitcnt vmcnt(0) 4463; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc 4464; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 4465; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 4466; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4467; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 4468; GFX90A-NEXT: s_waitcnt vmcnt(0) 4469; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 4470; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4471; GFX90A-NEXT: v_mov_b32_e32 v7, v8 4472; GFX90A-NEXT: buffer_wbinvl1 4473; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 4474; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 4475; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 4476; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 4477; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4478; GFX90A-NEXT: s_setpc_b64 s[30:31] 4479; 4480; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4481; GFX908: ; %bb.0: 4482; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4483; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 4484; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 4485; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 4486; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4487; GFX908-NEXT: s_mov_b32 s4, 0xffff 4488; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 4489; GFX908-NEXT: 
v_not_b32_e32 v11, v6 4490; GFX908-NEXT: s_mov_b64 s[6:7], exec 4491; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4492; GFX908-NEXT: v_readfirstlane_b32 s8, v0 4493; GFX908-NEXT: v_readfirstlane_b32 s9, v1 4494; GFX908-NEXT: v_readfirstlane_b32 s10, v2 4495; GFX908-NEXT: v_readfirstlane_b32 s11, v3 4496; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4497; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4498; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4499; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4500; GFX908-NEXT: s_nop 0 4501; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen 4502; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 4503; GFX908-NEXT: s_cbranch_execnz .LBB15_1 4504; GFX908-NEXT: ; %bb.2: 4505; GFX908-NEXT: s_mov_b64 exec, s[6:7] 4506; GFX908-NEXT: s_mov_b64 s[6:7], 0 4507; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start 4508; GFX908-NEXT: ; =>This Loop Header: Depth=1 4509; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 4510; GFX908-NEXT: s_waitcnt vmcnt(0) 4511; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4512; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 4513; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 4514; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 4515; GFX908-NEXT: v_mov_b32_e32 v9, v7 4516; GFX908-NEXT: s_mov_b64 s[12:13], exec 4517; GFX908-NEXT: v_mov_b32_e32 v8, v6 4518; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4519; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 4520; GFX908-NEXT: v_readfirstlane_b32 s8, v0 4521; GFX908-NEXT: v_readfirstlane_b32 s9, v1 4522; GFX908-NEXT: v_readfirstlane_b32 s10, v2 4523; GFX908-NEXT: v_readfirstlane_b32 s11, v3 4524; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4525; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4526; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4527; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4528; GFX908-NEXT: s_waitcnt vmcnt(0) 4529; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc 4530; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 
4531; GFX908-NEXT: s_cbranch_execnz .LBB15_4 4532; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4533; GFX908-NEXT: s_mov_b64 exec, s[12:13] 4534; GFX908-NEXT: s_waitcnt vmcnt(0) 4535; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 4536; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4537; GFX908-NEXT: v_mov_b32_e32 v7, v8 4538; GFX908-NEXT: buffer_wbinvl1 4539; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 4540; GFX908-NEXT: s_cbranch_execnz .LBB15_3 4541; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 4542; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 4543; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4544; GFX908-NEXT: s_setpc_b64 s[30:31] 4545; 4546; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4547; GFX8: ; %bb.0: 4548; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4549; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 4550; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 4551; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 4552; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 4553; GFX8-NEXT: s_mov_b32 s4, 0xffff 4554; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 4555; GFX8-NEXT: v_not_b32_e32 v11, v6 4556; GFX8-NEXT: s_mov_b64 s[6:7], exec 4557; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4558; GFX8-NEXT: v_readfirstlane_b32 s8, v0 4559; GFX8-NEXT: v_readfirstlane_b32 s9, v1 4560; GFX8-NEXT: v_readfirstlane_b32 s10, v2 4561; GFX8-NEXT: v_readfirstlane_b32 s11, v3 4562; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4563; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4564; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4565; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4566; GFX8-NEXT: s_nop 0 4567; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen 4568; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 4569; GFX8-NEXT: s_cbranch_execnz .LBB15_1 4570; GFX8-NEXT: ; %bb.2: 4571; GFX8-NEXT: s_mov_b64 exec, s[6:7] 4572; GFX8-NEXT: s_mov_b64 s[6:7], 0 4573; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start 4574; GFX8-NEXT: ; =>This Loop Header: Depth=1 4575; 
GFX8-NEXT: ; Child Loop BB15_4 Depth 2 4576; GFX8-NEXT: s_waitcnt vmcnt(0) 4577; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 4578; GFX8-NEXT: v_add_f16_e32 v6, v6, v5 4579; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 4580; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 4581; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 4582; GFX8-NEXT: v_mov_b32_e32 v9, v7 4583; GFX8-NEXT: s_mov_b64 s[12:13], exec 4584; GFX8-NEXT: v_mov_b32_e32 v8, v6 4585; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4586; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 4587; GFX8-NEXT: v_readfirstlane_b32 s8, v0 4588; GFX8-NEXT: v_readfirstlane_b32 s9, v1 4589; GFX8-NEXT: v_readfirstlane_b32 s10, v2 4590; GFX8-NEXT: v_readfirstlane_b32 s11, v3 4591; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4592; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4593; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4594; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4595; GFX8-NEXT: s_waitcnt vmcnt(0) 4596; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc 4597; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 4598; GFX8-NEXT: s_cbranch_execnz .LBB15_4 4599; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4600; GFX8-NEXT: s_mov_b64 exec, s[12:13] 4601; GFX8-NEXT: s_waitcnt vmcnt(0) 4602; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 4603; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4604; GFX8-NEXT: v_mov_b32_e32 v7, v8 4605; GFX8-NEXT: buffer_wbinvl1 4606; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 4607; GFX8-NEXT: s_cbranch_execnz .LBB15_3 4608; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 4609; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 4610; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 4611; GFX8-NEXT: s_setpc_b64 s[30:31] 4612; 4613; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4614; GFX7: ; %bb.0: 4615; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4616; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 4617; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 4618; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 
4619; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 4620; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 4621; GFX7-NEXT: v_not_b32_e32 v9, v4 4622; GFX7-NEXT: s_mov_b64 s[6:7], exec 4623; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4624; GFX7-NEXT: v_readfirstlane_b32 s8, v0 4625; GFX7-NEXT: v_readfirstlane_b32 s9, v1 4626; GFX7-NEXT: v_readfirstlane_b32 s10, v2 4627; GFX7-NEXT: v_readfirstlane_b32 s11, v3 4628; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4629; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4630; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4631; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4632; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 4633; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 4634; GFX7-NEXT: s_cbranch_execnz .LBB15_1 4635; GFX7-NEXT: ; %bb.2: 4636; GFX7-NEXT: s_mov_b64 exec, s[6:7] 4637; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 4638; GFX7-NEXT: s_mov_b64 s[6:7], 0 4639; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 4640; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start 4641; GFX7-NEXT: ; =>This Loop Header: Depth=1 4642; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 4643; GFX7-NEXT: s_waitcnt vmcnt(0) 4644; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 4645; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 4646; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 4647; GFX7-NEXT: s_mov_b64 s[12:13], exec 4648; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 4649; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 4650; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 4651; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 4652; GFX7-NEXT: v_mov_b32_e32 v4, v5 4653; GFX7-NEXT: v_mov_b32_e32 v5, v6 4654; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4655; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 4656; GFX7-NEXT: v_readfirstlane_b32 s8, v0 4657; GFX7-NEXT: v_readfirstlane_b32 s9, v1 4658; GFX7-NEXT: v_readfirstlane_b32 s10, v2 4659; GFX7-NEXT: v_readfirstlane_b32 s11, v3 4660; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4661; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4662; GFX7-NEXT: s_and_b64 s[4:5], vcc, 
s[4:5] 4663; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4664; GFX7-NEXT: s_waitcnt vmcnt(0) 4665; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 4666; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 4667; GFX7-NEXT: s_cbranch_execnz .LBB15_4 4668; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4669; GFX7-NEXT: s_mov_b64 exec, s[12:13] 4670; GFX7-NEXT: s_waitcnt vmcnt(0) 4671; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 4672; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4673; GFX7-NEXT: v_mov_b32_e32 v6, v4 4674; GFX7-NEXT: buffer_wbinvl1 4675; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 4676; GFX7-NEXT: s_cbranch_execnz .LBB15_3 4677; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 4678; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 4679; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 4680; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 4681; GFX7-NEXT: s_setpc_b64 s[30:31] 4682; 4683; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 4684; GFX6: ; %bb.0: 4685; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4686; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 4687; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 4688; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 4689; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 4690; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 4691; GFX6-NEXT: v_not_b32_e32 v9, v4 4692; GFX6-NEXT: s_mov_b64 s[6:7], exec 4693; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4694; GFX6-NEXT: v_readfirstlane_b32 s8, v0 4695; GFX6-NEXT: v_readfirstlane_b32 s9, v1 4696; GFX6-NEXT: v_readfirstlane_b32 s10, v2 4697; GFX6-NEXT: v_readfirstlane_b32 s11, v3 4698; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4699; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4700; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4701; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4702; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 4703; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 4704; GFX6-NEXT: s_cbranch_execnz .LBB15_1 4705; GFX6-NEXT: ; %bb.2: 4706; GFX6-NEXT: 
s_mov_b64 exec, s[6:7] 4707; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 4708; GFX6-NEXT: s_mov_b64 s[6:7], 0 4709; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 4710; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start 4711; GFX6-NEXT: ; =>This Loop Header: Depth=1 4712; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 4713; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4714; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 4715; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 4716; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 4717; GFX6-NEXT: s_mov_b64 s[12:13], exec 4718; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 4719; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 4720; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 4721; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 4722; GFX6-NEXT: v_mov_b32_e32 v4, v5 4723; GFX6-NEXT: v_mov_b32_e32 v5, v6 4724; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4725; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 4726; GFX6-NEXT: v_readfirstlane_b32 s8, v0 4727; GFX6-NEXT: v_readfirstlane_b32 s9, v1 4728; GFX6-NEXT: v_readfirstlane_b32 s10, v2 4729; GFX6-NEXT: v_readfirstlane_b32 s11, v3 4730; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4731; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 4732; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 4733; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 4734; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4735; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 4736; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 4737; GFX6-NEXT: s_cbranch_execnz .LBB15_4 4738; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4739; GFX6-NEXT: s_mov_b64 exec, s[12:13] 4740; GFX6-NEXT: s_waitcnt vmcnt(0) 4741; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 4742; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4743; GFX6-NEXT: v_mov_b32_e32 v6, v4 4744; GFX6-NEXT: buffer_wbinvl1 4745; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 4746; GFX6-NEXT: s_cbranch_execnz .LBB15_3 4747; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 4748; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 4749; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 4750; GFX6-NEXT: 
v_cvt_f32_f16_e32 v0, v0 4751; GFX6-NEXT: s_waitcnt expcnt(0) 4752; GFX6-NEXT: s_setpc_b64 s[30:31] 4753 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 4754 %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 4755 ret half %result 4756} 4757 4758; -------------------------------------------------------------------- 4759; bfloat 4760; -------------------------------------------------------------------- 4761 ; Returning agent-scope seq_cst bf16 fadd through an inreg buffer fat pointer at a 256-element (0x200-byte) offset - every prefix below is checked for a buffer_atomic_cmpswap retry loop (atomicrmw.start), not a native bf16 atomic add. 4762define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { 4763; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4764; GFX12: ; %bb.0: 4765; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4766; GFX12-NEXT: s_wait_expcnt 0x0 4767; GFX12-NEXT: s_wait_samplecnt 0x0 4768; GFX12-NEXT: s_wait_bvhcnt 0x0 4769; GFX12-NEXT: s_wait_kmcnt 0x0 4770; GFX12-NEXT: s_addk_co_i32 s16, 0x200 4771; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4772; GFX12-NEXT: s_wait_alu 0xfffe 4773; GFX12-NEXT: s_and_b32 s4, s16, -4 4774; GFX12-NEXT: s_wait_alu 0xfffe 4775; GFX12-NEXT: v_mov_b32_e32 v4, s4 4776; GFX12-NEXT: s_and_b32 s4, s16, 3 4777; GFX12-NEXT: s_wait_alu 0xfffe 4778; GFX12-NEXT: s_lshl_b32 s4, s4, 3 4779; GFX12-NEXT: s_wait_alu 0xfffe 4780; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 4781; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen 4782; GFX12-NEXT: s_wait_alu 0xfffe 4783; GFX12-NEXT: s_not_b32 s6, s5 4784; GFX12-NEXT: s_mov_b32 s5, 0 4785; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start 4786; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 4787; GFX12-NEXT: s_wait_loadcnt 0x0 4788; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 4789; GFX12-NEXT: s_wait_storecnt 0x0 4790; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4791; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4792; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 4793; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4794; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 4795; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 4796; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 4797; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 4798; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4799; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 4800; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4801; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 4802; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 4803; GFX12-NEXT: s_wait_alu 0xfffe 4804; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 4805; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4806; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4807; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN 4808; GFX12-NEXT: s_wait_loadcnt 0x0 4809; GFX12-NEXT: global_inv scope:SCOPE_DEV 4810; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 4811; GFX12-NEXT: v_mov_b32_e32 v1, v2 4812; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 4813; GFX12-NEXT: s_wait_alu 0xfffe 4814; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 4815; GFX12-NEXT: s_cbranch_execnz .LBB16_1 4816; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 4817; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 4818; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 4819; GFX12-NEXT: s_wait_alu 0xfffe 4820; GFX12-NEXT: s_setpc_b64 s[30:31] 4821; 4822; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4823; GFX940: ; %bb.0: 4824; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4825; GFX940-NEXT: s_addk_i32 s16, 0x200 4826; GFX940-NEXT: s_and_b32 s4, s16, -4 4827; GFX940-NEXT: v_mov_b32_e32 v4, s4 4828; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen 4829; GFX940-NEXT: s_and_b32 s4, s16, 3 4830; GFX940-NEXT: s_lshl_b32 s6, s4, 3 4831; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 4832; GFX940-NEXT: s_not_b32 s7, s4 4833; GFX940-NEXT: s_mov_b64 
s[4:5], 0 4834; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4835; GFX940-NEXT: s_movk_i32 s8, 0x7fff 4836; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start 4837; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 4838; GFX940-NEXT: s_waitcnt vmcnt(0) 4839; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4840; GFX940-NEXT: buffer_wbl2 sc1 4841; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 4842; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 4843; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 4844; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 4845; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 4846; GFX940-NEXT: s_nop 1 4847; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 4848; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4849; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 4850; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] 4851; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 4852; GFX940-NEXT: s_waitcnt vmcnt(0) 4853; GFX940-NEXT: buffer_inv sc1 4854; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4855; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4856; GFX940-NEXT: v_mov_b32_e32 v1, v2 4857; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 4858; GFX940-NEXT: s_cbranch_execnz .LBB16_1 4859; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 4860; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 4861; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 4862; GFX940-NEXT: s_setpc_b64 s[30:31] 4863; 4864; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4865; GFX11: ; %bb.0: 4866; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4867; GFX11-NEXT: s_addk_i32 s16, 0x200 4868; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4869; GFX11-NEXT: s_and_b32 s4, s16, -4 4870; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4871; GFX11-NEXT: v_mov_b32_e32 v4, s4 4872; GFX11-NEXT: s_and_b32 s4, s16, 3 4873; GFX11-NEXT: s_lshl_b32 s4, s4, 3 4874; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4875; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 4876; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen 4877; GFX11-NEXT: s_not_b32 s6, s5 4878; GFX11-NEXT: s_mov_b32 s5, 0 4879; GFX11-NEXT: .p2align 6 4880; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start 4881; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4882; GFX11-NEXT: s_waitcnt vmcnt(0) 4883; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 4884; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4885; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4886; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4887; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 4888; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4889; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 4890; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 4891; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 4892; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 4893; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4894; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 4895; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4896; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4897; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 4898; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 4899; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4900; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4901; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc 4902; GFX11-NEXT: s_waitcnt vmcnt(0) 4903; GFX11-NEXT: buffer_gl1_inv 4904; GFX11-NEXT: buffer_gl0_inv 4905; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 4906; GFX11-NEXT: v_mov_b32_e32 v1, v2 4907; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 4908; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4909; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 4910; GFX11-NEXT: s_cbranch_execnz .LBB16_1 4911; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 4912; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 4913; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, s4, v2 4914; GFX11-NEXT: s_setpc_b64 s[30:31] 4915; 4916; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4917; GFX10: ; %bb.0: 4918; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4919; GFX10-NEXT: s_addk_i32 s20, 0x200 4920; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4921; GFX10-NEXT: s_and_b32 s4, s20, -4 4922; GFX10-NEXT: v_mov_b32_e32 v4, s4 4923; GFX10-NEXT: s_and_b32 s4, s20, 3 4924; GFX10-NEXT: s_lshl_b32 s4, s4, 3 4925; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 4926; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4927; GFX10-NEXT: s_not_b32 s6, s5 4928; GFX10-NEXT: s_mov_b32 s5, 0 4929; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start 4930; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4931; GFX10-NEXT: s_waitcnt vmcnt(0) 4932; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4933; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4934; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 4935; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 4936; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 4937; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 4938; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 4939; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 4940; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4941; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 4942; GFX10-NEXT: v_mov_b32_e32 v3, v1 4943; GFX10-NEXT: v_mov_b32_e32 v2, v0 4944; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4945; GFX10-NEXT: s_waitcnt vmcnt(0) 4946; GFX10-NEXT: buffer_gl1_inv 4947; GFX10-NEXT: buffer_gl0_inv 4948; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 4949; GFX10-NEXT: v_mov_b32_e32 v1, v2 4950; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 4951; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 4952; GFX10-NEXT: s_cbranch_execnz .LBB16_1 4953; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 4954; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 4955; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 4956; GFX10-NEXT: s_setpc_b64 s[30:31] 4957; 4958; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4959; GFX90A: ; %bb.0: 4960; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4961; GFX90A-NEXT: s_addk_i32 s20, 0x200 4962; GFX90A-NEXT: s_and_b32 s4, s20, -4 4963; GFX90A-NEXT: v_mov_b32_e32 v4, s4 4964; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4965; GFX90A-NEXT: s_and_b32 s4, s20, 3 4966; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 4967; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 4968; GFX90A-NEXT: s_not_b32 s7, s4 4969; GFX90A-NEXT: s_mov_b64 s[4:5], 0 4970; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4971; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 4972; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start 4973; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4974; GFX90A-NEXT: s_waitcnt vmcnt(0) 4975; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4976; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 4977; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 4978; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 4979; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 4980; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 4981; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 4982; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4983; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 4984; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] 4985; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4986; GFX90A-NEXT: s_waitcnt vmcnt(0) 4987; GFX90A-NEXT: buffer_wbinvl1 4988; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4989; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4990; GFX90A-NEXT: v_mov_b32_e32 v1, v2 4991; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 4992; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 4993; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 4994; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4995; 
GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 4996; GFX90A-NEXT: s_setpc_b64 s[30:31] 4997; 4998; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4999; GFX908: ; %bb.0: 5000; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5001; GFX908-NEXT: s_addk_i32 s20, 0x200 5002; GFX908-NEXT: s_and_b32 s4, s20, -4 5003; GFX908-NEXT: v_mov_b32_e32 v4, s4 5004; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 5005; GFX908-NEXT: s_and_b32 s4, s20, 3 5006; GFX908-NEXT: s_lshl_b32 s6, s4, 3 5007; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 5008; GFX908-NEXT: s_not_b32 s7, s4 5009; GFX908-NEXT: s_mov_b64 s[4:5], 0 5010; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 5011; GFX908-NEXT: s_movk_i32 s8, 0x7fff 5012; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start 5013; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5014; GFX908-NEXT: s_waitcnt vmcnt(0) 5015; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5016; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 5017; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 5018; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 5019; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 5020; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 5021; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 5022; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5023; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 5024; GFX908-NEXT: v_mov_b32_e32 v3, v1 5025; GFX908-NEXT: v_mov_b32_e32 v2, v0 5026; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 5027; GFX908-NEXT: s_waitcnt vmcnt(0) 5028; GFX908-NEXT: buffer_wbinvl1 5029; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5030; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5031; GFX908-NEXT: v_mov_b32_e32 v1, v2 5032; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5033; GFX908-NEXT: s_cbranch_execnz .LBB16_1 5034; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5035; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 
5036; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 5037; GFX908-NEXT: s_setpc_b64 s[30:31] 5038; 5039; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 5040; GFX8: ; %bb.0: 5041; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5042; GFX8-NEXT: s_addk_i32 s20, 0x200 5043; GFX8-NEXT: s_and_b32 s4, s20, -4 5044; GFX8-NEXT: v_mov_b32_e32 v4, s4 5045; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 5046; GFX8-NEXT: s_and_b32 s4, s20, 3 5047; GFX8-NEXT: s_lshl_b32 s6, s4, 3 5048; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 5049; GFX8-NEXT: s_not_b32 s7, s4 5050; GFX8-NEXT: s_mov_b64 s[4:5], 0 5051; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 5052; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start 5053; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5054; GFX8-NEXT: v_mov_b32_e32 v0, s6 5055; GFX8-NEXT: s_waitcnt vmcnt(0) 5056; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5057; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 5058; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 5059; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 5060; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 5061; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 5062; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 5063; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc 5064; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 5065; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5066; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 5067; GFX8-NEXT: v_mov_b32_e32 v3, v1 5068; GFX8-NEXT: v_mov_b32_e32 v2, v0 5069; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 5070; GFX8-NEXT: s_waitcnt vmcnt(0) 5071; GFX8-NEXT: buffer_wbinvl1 5072; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5073; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5074; GFX8-NEXT: v_mov_b32_e32 v1, v2 5075; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5076; GFX8-NEXT: s_cbranch_execnz .LBB16_1 5077; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5078; GFX8-NEXT: s_or_b64 
exec, exec, s[4:5] 5079; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 5080; GFX8-NEXT: s_setpc_b64 s[30:31] 5081; 5082; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 5083; GFX7: ; %bb.0: 5084; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5085; GFX7-NEXT: s_addk_i32 s20, 0x200 5086; GFX7-NEXT: s_and_b32 s4, s20, -4 5087; GFX7-NEXT: v_mov_b32_e32 v4, s4 5088; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 5089; GFX7-NEXT: s_and_b32 s4, s20, 3 5090; GFX7-NEXT: s_lshl_b32 s6, s4, 3 5091; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 5092; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 5093; GFX7-NEXT: s_not_b32 s7, s4 5094; GFX7-NEXT: s_mov_b64 s[4:5], 0 5095; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 5096; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start 5097; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5098; GFX7-NEXT: s_waitcnt vmcnt(0) 5099; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 5100; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5101; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 5102; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5103; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 5104; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 5105; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 5106; GFX7-NEXT: v_mov_b32_e32 v3, v1 5107; GFX7-NEXT: v_mov_b32_e32 v2, v0 5108; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 5109; GFX7-NEXT: s_waitcnt vmcnt(0) 5110; GFX7-NEXT: buffer_wbinvl1 5111; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5112; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5113; GFX7-NEXT: v_mov_b32_e32 v1, v2 5114; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5115; GFX7-NEXT: s_cbranch_execnz .LBB16_1 5116; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5117; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5118; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 5119; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5120; GFX7-NEXT: s_setpc_b64 s[30:31] 5121; 5122; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: 5123; GFX6: ; %bb.0: 5124; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5125; GFX6-NEXT: s_addk_i32 s20, 0x200 5126; GFX6-NEXT: s_and_b32 s4, s20, -4 5127; GFX6-NEXT: v_mov_b32_e32 v4, s4 5128; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 5129; GFX6-NEXT: s_and_b32 s4, s20, 3 5130; GFX6-NEXT: s_lshl_b32 s6, s4, 3 5131; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 5132; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 5133; GFX6-NEXT: s_not_b32 s7, s4 5134; GFX6-NEXT: s_mov_b64 s[4:5], 0 5135; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 5136; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start 5137; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 5138; GFX6-NEXT: s_waitcnt vmcnt(0) 5139; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 5140; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5141; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 5142; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5143; GFX6-NEXT: s_waitcnt expcnt(0) 5144; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 5145; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 5146; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5147; GFX6-NEXT: v_mov_b32_e32 v3, v1 5148; GFX6-NEXT: v_mov_b32_e32 v2, v0 5149; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 5150; GFX6-NEXT: s_waitcnt vmcnt(0) 5151; GFX6-NEXT: buffer_wbinvl1 5152; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 5153; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5154; GFX6-NEXT: v_mov_b32_e32 v1, v2 5155; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 5156; GFX6-NEXT: s_cbranch_execnz .LBB16_1 5157; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 5158; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 5159; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 5160; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5161; GFX6-NEXT: s_waitcnt expcnt(0) 5162; GFX6-NEXT: s_setpc_b64 s[30:31] 5163 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 5164 %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 5165 ret bfloat %result 5166} 5167 5168define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { 5169; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5170; GFX12: ; %bb.0: 5171; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5172; GFX12-NEXT: s_wait_expcnt 0x0 5173; GFX12-NEXT: s_wait_samplecnt 0x0 5174; GFX12-NEXT: s_wait_bvhcnt 0x0 5175; GFX12-NEXT: s_wait_kmcnt 0x0 5176; GFX12-NEXT: s_addk_co_i32 s16, 0x200 5177; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 5178; GFX12-NEXT: s_wait_alu 0xfffe 5179; GFX12-NEXT: s_and_b32 s4, s16, -4 5180; GFX12-NEXT: s_wait_alu 0xfffe 5181; GFX12-NEXT: v_mov_b32_e32 v2, s4 5182; GFX12-NEXT: s_and_b32 s4, s16, 3 5183; GFX12-NEXT: s_wait_alu 0xfffe 5184; GFX12-NEXT: s_lshl_b32 s4, s4, 3 5185; GFX12-NEXT: s_wait_alu 0xfffe 5186; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 5187; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen 5188; GFX12-NEXT: s_wait_alu 0xfffe 5189; GFX12-NEXT: s_not_b32 s6, s5 5190; GFX12-NEXT: s_mov_b32 s5, 0 5191; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start 5192; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5193; GFX12-NEXT: s_wait_loadcnt 0x0 5194; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 5195; GFX12-NEXT: s_wait_storecnt 0x0 5196; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5197; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5198; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 5199; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 5200; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 5201; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 5202; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5203; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff 5204; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5205; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo 5206; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5207; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) 5208; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 5209; GFX12-NEXT: s_wait_alu 0xfffe 5210; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 5211; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5212; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 5213; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN 5214; GFX12-NEXT: s_wait_loadcnt 0x0 5215; GFX12-NEXT: global_inv scope:SCOPE_DEV 5216; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 5217; GFX12-NEXT: v_mov_b32_e32 v1, v4 5218; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 5219; GFX12-NEXT: s_wait_alu 0xfffe 5220; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 5221; GFX12-NEXT: s_cbranch_execnz .LBB17_1 5222; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 5223; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 5224; GFX12-NEXT: s_wait_alu 0xfffe 5225; GFX12-NEXT: s_setpc_b64 s[30:31] 5226; 5227; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5228; GFX940: ; %bb.0: 5229; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5230; GFX940-NEXT: s_addk_i32 s16, 0x200 5231; GFX940-NEXT: s_and_b32 s4, s16, -4 5232; GFX940-NEXT: v_mov_b32_e32 v2, s4 5233; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen 5234; GFX940-NEXT: s_and_b32 s4, s16, 3 5235; GFX940-NEXT: s_lshl_b32 s6, s4, 3 5236; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 5237; GFX940-NEXT: s_not_b32 s7, s4 5238; GFX940-NEXT: s_mov_b64 s[4:5], 0 5239; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 5240; GFX940-NEXT: s_movk_i32 s8, 0x7fff 5241; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start 5242; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 5243; GFX940-NEXT: s_waitcnt vmcnt(0) 5244; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5245; GFX940-NEXT: buffer_wbl2 sc1 5246; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 5247; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 5248; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 5249; GFX940-NEXT: v_add3_u32 v4, 
v4, v0, s8 5250; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 5251; GFX940-NEXT: s_nop 1 5252; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 5253; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5254; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 5255; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] 5256; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 5257; GFX940-NEXT: s_waitcnt vmcnt(0) 5258; GFX940-NEXT: buffer_inv sc1 5259; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5260; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5261; GFX940-NEXT: v_mov_b32_e32 v1, v4 5262; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 5263; GFX940-NEXT: s_cbranch_execnz .LBB17_1 5264; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 5265; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 5266; GFX940-NEXT: s_setpc_b64 s[30:31] 5267; 5268; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5269; GFX11: ; %bb.0: 5270; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5271; GFX11-NEXT: s_addk_i32 s16, 0x200 5272; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 5273; GFX11-NEXT: s_and_b32 s4, s16, -4 5274; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5275; GFX11-NEXT: v_mov_b32_e32 v2, s4 5276; GFX11-NEXT: s_and_b32 s4, s16, 3 5277; GFX11-NEXT: s_lshl_b32 s4, s4, 3 5278; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5279; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 5280; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen 5281; GFX11-NEXT: s_not_b32 s6, s5 5282; GFX11-NEXT: s_mov_b32 s5, 0 5283; GFX11-NEXT: .p2align 6 5284; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start 5285; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5286; GFX11-NEXT: s_waitcnt vmcnt(0) 5287; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 5288; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5289; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5290; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, 
v0 5291; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 5292; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 5293; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 5294; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 5295; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5296; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff 5297; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5298; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo 5299; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5300; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5301; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 5302; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 5303; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5304; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 5305; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc 5306; GFX11-NEXT: s_waitcnt vmcnt(0) 5307; GFX11-NEXT: buffer_gl1_inv 5308; GFX11-NEXT: buffer_gl0_inv 5309; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 5310; GFX11-NEXT: v_mov_b32_e32 v1, v4 5311; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 5312; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5313; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 5314; GFX11-NEXT: s_cbranch_execnz .LBB17_1 5315; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 5316; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 5317; GFX11-NEXT: s_setpc_b64 s[30:31] 5318; 5319; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5320; GFX10: ; %bb.0: 5321; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5322; GFX10-NEXT: s_addk_i32 s20, 0x200 5323; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 5324; GFX10-NEXT: s_and_b32 s4, s20, -4 5325; GFX10-NEXT: v_mov_b32_e32 v2, s4 5326; GFX10-NEXT: s_and_b32 s4, s20, 3 5327; GFX10-NEXT: s_lshl_b32 s4, s4, 3 5328; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 5329; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 5330; GFX10-NEXT: s_not_b32 s6, s5 5331; GFX10-NEXT: 
s_mov_b32 s5, 0 5332; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start 5333; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5334; GFX10-NEXT: s_waitcnt vmcnt(0) 5335; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5336; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5337; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 5338; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 5339; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 5340; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5341; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff 5342; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo 5343; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5344; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 5345; GFX10-NEXT: v_mov_b32_e32 v5, v1 5346; GFX10-NEXT: v_mov_b32_e32 v4, v0 5347; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 5348; GFX10-NEXT: s_waitcnt vmcnt(0) 5349; GFX10-NEXT: buffer_gl1_inv 5350; GFX10-NEXT: buffer_gl0_inv 5351; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 5352; GFX10-NEXT: v_mov_b32_e32 v1, v4 5353; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5354; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5355; GFX10-NEXT: s_cbranch_execnz .LBB17_1 5356; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5357; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5358; GFX10-NEXT: s_setpc_b64 s[30:31] 5359; 5360; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5361; GFX90A: ; %bb.0: 5362; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5363; GFX90A-NEXT: s_addk_i32 s20, 0x200 5364; GFX90A-NEXT: s_and_b32 s4, s20, -4 5365; GFX90A-NEXT: v_mov_b32_e32 v2, s4 5366; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 5367; GFX90A-NEXT: s_and_b32 s4, s20, 3 5368; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 5369; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 5370; GFX90A-NEXT: s_not_b32 s7, s4 5371; GFX90A-NEXT: s_mov_b64 s[4:5], 0 5372; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 
5373; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 5374; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start 5375; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5376; GFX90A-NEXT: s_waitcnt vmcnt(0) 5377; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5378; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 5379; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 5380; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 5381; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 5382; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 5383; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 5384; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5385; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 5386; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] 5387; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 5388; GFX90A-NEXT: s_waitcnt vmcnt(0) 5389; GFX90A-NEXT: buffer_wbinvl1 5390; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5391; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5392; GFX90A-NEXT: v_mov_b32_e32 v1, v4 5393; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 5394; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 5395; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 5396; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5397; GFX90A-NEXT: s_setpc_b64 s[30:31] 5398; 5399; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5400; GFX908: ; %bb.0: 5401; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5402; GFX908-NEXT: s_addk_i32 s20, 0x200 5403; GFX908-NEXT: s_and_b32 s4, s20, -4 5404; GFX908-NEXT: v_mov_b32_e32 v2, s4 5405; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 5406; GFX908-NEXT: s_and_b32 s4, s20, 3 5407; GFX908-NEXT: s_lshl_b32 s6, s4, 3 5408; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 5409; GFX908-NEXT: s_not_b32 s7, s4 5410; GFX908-NEXT: s_mov_b64 s[4:5], 0 5411; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 5412; GFX908-NEXT: s_movk_i32 s8, 0x7fff 5413; 
GFX908-NEXT: .LBB17_1: ; %atomicrmw.start 5414; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5415; GFX908-NEXT: s_waitcnt vmcnt(0) 5416; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5417; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 5418; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 5419; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 5420; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 5421; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 5422; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 5423; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5424; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 5425; GFX908-NEXT: v_mov_b32_e32 v5, v1 5426; GFX908-NEXT: v_mov_b32_e32 v4, v0 5427; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 5428; GFX908-NEXT: s_waitcnt vmcnt(0) 5429; GFX908-NEXT: buffer_wbinvl1 5430; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5431; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5432; GFX908-NEXT: v_mov_b32_e32 v1, v4 5433; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5434; GFX908-NEXT: s_cbranch_execnz .LBB17_1 5435; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5436; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5437; GFX908-NEXT: s_setpc_b64 s[30:31] 5438; 5439; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5440; GFX8: ; %bb.0: 5441; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5442; GFX8-NEXT: s_addk_i32 s20, 0x200 5443; GFX8-NEXT: s_and_b32 s4, s20, -4 5444; GFX8-NEXT: v_mov_b32_e32 v2, s4 5445; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 5446; GFX8-NEXT: s_and_b32 s4, s20, 3 5447; GFX8-NEXT: s_lshl_b32 s6, s4, 3 5448; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 5449; GFX8-NEXT: s_not_b32 s7, s4 5450; GFX8-NEXT: s_mov_b64 s[4:5], 0 5451; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 5452; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start 5453; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5454; GFX8-NEXT: 
v_mov_b32_e32 v0, s6 5455; GFX8-NEXT: s_waitcnt vmcnt(0) 5456; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5457; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 5458; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 5459; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 5460; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 5461; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 5462; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 5463; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc 5464; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 5465; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5466; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 5467; GFX8-NEXT: v_mov_b32_e32 v5, v1 5468; GFX8-NEXT: v_mov_b32_e32 v4, v0 5469; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 5470; GFX8-NEXT: s_waitcnt vmcnt(0) 5471; GFX8-NEXT: buffer_wbinvl1 5472; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5473; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5474; GFX8-NEXT: v_mov_b32_e32 v1, v4 5475; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5476; GFX8-NEXT: s_cbranch_execnz .LBB17_1 5477; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5478; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5479; GFX8-NEXT: s_setpc_b64 s[30:31] 5480; 5481; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5482; GFX7: ; %bb.0: 5483; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5484; GFX7-NEXT: s_addk_i32 s20, 0x200 5485; GFX7-NEXT: s_and_b32 s4, s20, -4 5486; GFX7-NEXT: v_mov_b32_e32 v2, s4 5487; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 5488; GFX7-NEXT: s_and_b32 s4, s20, 3 5489; GFX7-NEXT: s_lshl_b32 s6, s4, 3 5490; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 5491; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 5492; GFX7-NEXT: s_not_b32 s7, s4 5493; GFX7-NEXT: s_mov_b64 s[4:5], 0 5494; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 5495; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start 5496; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 5497; GFX7-NEXT: s_waitcnt vmcnt(0) 5498; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 5499; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5500; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 5501; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5502; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 5503; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 5504; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 5505; GFX7-NEXT: v_mov_b32_e32 v5, v1 5506; GFX7-NEXT: v_mov_b32_e32 v4, v0 5507; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 5508; GFX7-NEXT: s_waitcnt vmcnt(0) 5509; GFX7-NEXT: buffer_wbinvl1 5510; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5511; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5512; GFX7-NEXT: v_mov_b32_e32 v1, v4 5513; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5514; GFX7-NEXT: s_cbranch_execnz .LBB17_1 5515; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5516; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5517; GFX7-NEXT: s_setpc_b64 s[30:31] 5518; 5519; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: 5520; GFX6: ; %bb.0: 5521; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5522; GFX6-NEXT: s_addk_i32 s20, 0x200 5523; GFX6-NEXT: s_and_b32 s4, s20, -4 5524; GFX6-NEXT: v_mov_b32_e32 v2, s4 5525; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 5526; GFX6-NEXT: s_and_b32 s4, s20, 3 5527; GFX6-NEXT: s_lshl_b32 s6, s4, 3 5528; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 5529; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 5530; GFX6-NEXT: s_not_b32 s7, s4 5531; GFX6-NEXT: s_mov_b64 s[4:5], 0 5532; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 5533; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start 5534; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 5535; GFX6-NEXT: s_waitcnt vmcnt(0) 5536; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 5537; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5538; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 5539; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5540; GFX6-NEXT: s_waitcnt expcnt(0) 5541; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 5542; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, 
v0 5543; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 5544; GFX6-NEXT: v_mov_b32_e32 v5, v1 5545; GFX6-NEXT: v_mov_b32_e32 v4, v0 5546; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 5547; GFX6-NEXT: s_waitcnt vmcnt(0) 5548; GFX6-NEXT: buffer_wbinvl1 5549; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5550; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5551; GFX6-NEXT: v_mov_b32_e32 v1, v4 5552; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 5553; GFX6-NEXT: s_cbranch_execnz .LBB17_1 5554; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 5555; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 5556; GFX6-NEXT: s_waitcnt expcnt(0) 5557; GFX6-NEXT: s_setpc_b64 s[30:31] 5558 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 5559 %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 5560 ret void 5561} 5562 5563define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { 5564; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5565; GFX12: ; %bb.0: 5566; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5567; GFX12-NEXT: s_wait_expcnt 0x0 5568; GFX12-NEXT: s_wait_samplecnt 0x0 5569; GFX12-NEXT: s_wait_bvhcnt 0x0 5570; GFX12-NEXT: s_wait_kmcnt 0x0 5571; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 5572; GFX12-NEXT: s_mov_b32 s1, exec_lo 5573; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5574; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 5575; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 5576; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 5577; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5578; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 5579; GFX12-NEXT: v_not_b32_e32 v9, v6 5580; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 5581; GFX12-NEXT: v_readfirstlane_b32 s4, v0 5582; GFX12-NEXT: v_readfirstlane_b32 s5, v1 5583; GFX12-NEXT: 
v_readfirstlane_b32 s6, v2 5584; GFX12-NEXT: v_readfirstlane_b32 s7, v3 5585; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5586; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 5587; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 5588; GFX12-NEXT: s_wait_alu 0xfffe 5589; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5590; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 5591; GFX12-NEXT: s_wait_alu 0xfffe 5592; GFX12-NEXT: s_and_saveexec_b32 s0, s0 5593; GFX12-NEXT: s_wait_loadcnt 0x0 5594; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen 5595; GFX12-NEXT: s_wait_alu 0xfffe 5596; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 5597; GFX12-NEXT: s_cbranch_execnz .LBB18_1 5598; GFX12-NEXT: ; %bb.2: 5599; GFX12-NEXT: s_mov_b32 exec_lo, s1 5600; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5601; GFX12-NEXT: s_mov_b32 s1, 0 5602; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start 5603; GFX12-NEXT: ; =>This Loop Header: Depth=1 5604; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 5605; GFX12-NEXT: s_wait_loadcnt 0x0 5606; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 5607; GFX12-NEXT: s_mov_b32 s2, exec_lo 5608; GFX12-NEXT: s_wait_storecnt 0x0 5609; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5610; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5611; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 5612; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 5613; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 5614; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 5615; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 5616; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 5617; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5618; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo 5619; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 5620; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5621; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 5622; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 5623; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5624; GFX12-NEXT: v_mov_b32_e32 v4, v5 5625; GFX12-NEXT: v_mov_b32_e32 v5, v6 5626; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 5627; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 5628; GFX12-NEXT: v_readfirstlane_b32 s4, v0 5629; GFX12-NEXT: v_readfirstlane_b32 s5, v1 5630; GFX12-NEXT: v_readfirstlane_b32 s6, v2 5631; GFX12-NEXT: v_readfirstlane_b32 s7, v3 5632; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5633; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 5634; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 5635; GFX12-NEXT: s_wait_alu 0xfffe 5636; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5637; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 5638; GFX12-NEXT: s_wait_alu 0xfffe 5639; GFX12-NEXT: s_and_saveexec_b32 s0, s0 5640; GFX12-NEXT: s_wait_loadcnt 0x0 5641; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN 5642; GFX12-NEXT: s_wait_alu 0xfffe 5643; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 5644; GFX12-NEXT: s_cbranch_execnz .LBB18_4 5645; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 5646; GFX12-NEXT: s_mov_b32 exec_lo, s2 5647; GFX12-NEXT: s_wait_loadcnt 0x0 5648; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 5649; GFX12-NEXT: v_mov_b32_e32 v6, v4 5650; GFX12-NEXT: global_inv scope:SCOPE_DEV 5651; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 5652; GFX12-NEXT: s_wait_alu 0xfffe 5653; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5654; GFX12-NEXT: s_cbranch_execnz .LBB18_3 5655; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 5656; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 5657; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5658; GFX12-NEXT: s_wait_alu 0xfffe 5659; GFX12-NEXT: s_setpc_b64 s[30:31] 5660; 5661; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5662; GFX940: ; %bb.0: 5663; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5664; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 5665; 
GFX940-NEXT: v_and_b32_e32 v9, -4, v4 5666; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 5667; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 5668; GFX940-NEXT: s_mov_b32 s0, 0xffff 5669; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 5670; GFX940-NEXT: v_not_b32_e32 v10, v4 5671; GFX940-NEXT: s_mov_b64 s[2:3], exec 5672; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 5673; GFX940-NEXT: v_readfirstlane_b32 s4, v0 5674; GFX940-NEXT: v_readfirstlane_b32 s5, v1 5675; GFX940-NEXT: v_readfirstlane_b32 s6, v2 5676; GFX940-NEXT: v_readfirstlane_b32 s7, v3 5677; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 5678; GFX940-NEXT: s_nop 0 5679; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 5680; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 5681; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 5682; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen 5683; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 5684; GFX940-NEXT: s_cbranch_execnz .LBB18_1 5685; GFX940-NEXT: ; %bb.2: 5686; GFX940-NEXT: s_mov_b64 exec, s[2:3] 5687; GFX940-NEXT: s_mov_b64 s[2:3], 0 5688; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 5689; GFX940-NEXT: s_movk_i32 s10, 0x7fff 5690; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start 5691; GFX940-NEXT: ; =>This Loop Header: Depth=1 5692; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 5693; GFX940-NEXT: s_waitcnt vmcnt(0) 5694; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5695; GFX940-NEXT: s_mov_b64 s[8:9], exec 5696; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 5697; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 5698; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 5699; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 5700; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 5701; GFX940-NEXT: buffer_wbl2 sc1 5702; GFX940-NEXT: s_nop 0 5703; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 5704; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5705; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 
5706; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 5707; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 5708; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 5709; GFX940-NEXT: v_readfirstlane_b32 s4, v0 5710; GFX940-NEXT: v_readfirstlane_b32 s5, v1 5711; GFX940-NEXT: v_readfirstlane_b32 s6, v2 5712; GFX940-NEXT: v_readfirstlane_b32 s7, v3 5713; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 5714; GFX940-NEXT: s_nop 0 5715; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 5716; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 5717; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 5718; GFX940-NEXT: s_waitcnt vmcnt(0) 5719; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 5720; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 5721; GFX940-NEXT: s_cbranch_execnz .LBB18_4 5722; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 5723; GFX940-NEXT: s_mov_b64 exec, s[8:9] 5724; GFX940-NEXT: s_waitcnt vmcnt(0) 5725; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 5726; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 5727; GFX940-NEXT: v_mov_b32_e32 v7, v4 5728; GFX940-NEXT: buffer_inv sc1 5729; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 5730; GFX940-NEXT: s_cbranch_execnz .LBB18_3 5731; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 5732; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 5733; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 5734; GFX940-NEXT: s_setpc_b64 s[30:31] 5735; 5736; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5737; GFX11: ; %bb.0: 5738; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5739; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 5740; GFX11-NEXT: s_mov_b32 s1, 0 5741; GFX11-NEXT: s_mov_b32 s2, exec_lo 5742; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5743; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 5744; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 5745; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 5746; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 5747; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 5748; GFX11-NEXT: v_not_b32_e32 v9, v6 5749; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 5750; GFX11-NEXT: v_readfirstlane_b32 s4, v0 5751; GFX11-NEXT: v_readfirstlane_b32 s5, v1 5752; GFX11-NEXT: v_readfirstlane_b32 s6, v2 5753; GFX11-NEXT: v_readfirstlane_b32 s7, v3 5754; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5755; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 5756; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 5757; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 5758; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 5759; GFX11-NEXT: s_and_saveexec_b32 s0, s0 5760; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen 5761; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 5762; GFX11-NEXT: s_cbranch_execnz .LBB18_1 5763; GFX11-NEXT: ; %bb.2: 5764; GFX11-NEXT: s_mov_b32 exec_lo, s2 5765; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5766; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 5767; GFX11-NEXT: .p2align 6 5768; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start 5769; GFX11-NEXT: ; =>This Loop Header: Depth=1 5770; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 5771; GFX11-NEXT: s_waitcnt vmcnt(0) 5772; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 5773; GFX11-NEXT: s_mov_b32 s2, exec_lo 5774; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5775; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5776; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5777; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 5778; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 5779; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 5780; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 5781; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 5782; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 5783; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5784; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo 5785; 
GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 5786; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5787; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 5788; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 5789; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5790; GFX11-NEXT: v_mov_b32_e32 v4, v5 5791; GFX11-NEXT: v_mov_b32_e32 v5, v6 5792; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 5793; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 5794; GFX11-NEXT: v_readfirstlane_b32 s4, v0 5795; GFX11-NEXT: v_readfirstlane_b32 s5, v1 5796; GFX11-NEXT: v_readfirstlane_b32 s6, v2 5797; GFX11-NEXT: v_readfirstlane_b32 s7, v3 5798; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5799; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 5800; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 5801; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 5802; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 5803; GFX11-NEXT: s_and_saveexec_b32 s0, s0 5804; GFX11-NEXT: s_waitcnt vmcnt(0) 5805; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc 5806; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 5807; GFX11-NEXT: s_cbranch_execnz .LBB18_4 5808; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 5809; GFX11-NEXT: s_mov_b32 exec_lo, s2 5810; GFX11-NEXT: s_waitcnt vmcnt(0) 5811; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 5812; GFX11-NEXT: v_mov_b32_e32 v6, v4 5813; GFX11-NEXT: buffer_gl1_inv 5814; GFX11-NEXT: buffer_gl0_inv 5815; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 5816; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5817; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5818; GFX11-NEXT: s_cbranch_execnz .LBB18_3 5819; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 5820; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 5821; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 5822; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5823; GFX11-NEXT: s_setpc_b64 s[30:31] 5824; 5825; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5826; GFX10: ; %bb.0: 5827; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5828; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 5829; GFX10-NEXT: s_mov_b32 s5, 0 5830; GFX10-NEXT: s_mov_b32 s6, exec_lo 5831; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 5832; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 5833; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 5834; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 5835; GFX10-NEXT: v_not_b32_e32 v9, v6 5836; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 5837; GFX10-NEXT: v_readfirstlane_b32 s8, v0 5838; GFX10-NEXT: v_readfirstlane_b32 s9, v1 5839; GFX10-NEXT: v_readfirstlane_b32 s10, v2 5840; GFX10-NEXT: v_readfirstlane_b32 s11, v3 5841; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 5842; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 5843; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 5844; GFX10-NEXT: s_and_saveexec_b32 s4, s4 5845; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 5846; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5847; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 5848; GFX10-NEXT: s_cbranch_execnz .LBB18_1 5849; GFX10-NEXT: ; %bb.2: 5850; GFX10-NEXT: s_mov_b32 exec_lo, s6 5851; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5852; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start 5853; GFX10-NEXT: ; =>This Loop Header: Depth=1 5854; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 5855; GFX10-NEXT: s_waitcnt vmcnt(0) 5856; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5857; GFX10-NEXT: s_mov_b32 s6, exec_lo 5858; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5859; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 5860; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 5861; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 5862; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 5863; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 5864; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo 5865; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5866; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 5867; GFX10-NEXT: v_mov_b32_e32 v4, v5 5868; GFX10-NEXT: v_mov_b32_e32 v5, v6 5869; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 5870; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 5871; GFX10-NEXT: v_readfirstlane_b32 s8, v0 5872; GFX10-NEXT: v_readfirstlane_b32 s9, v1 5873; GFX10-NEXT: v_readfirstlane_b32 s10, v2 5874; GFX10-NEXT: v_readfirstlane_b32 s11, v3 5875; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 5876; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 5877; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 5878; GFX10-NEXT: s_and_saveexec_b32 s4, s4 5879; GFX10-NEXT: s_waitcnt vmcnt(0) 5880; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 5881; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5882; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 5883; GFX10-NEXT: s_cbranch_execnz .LBB18_4 5884; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 5885; GFX10-NEXT: s_mov_b32 exec_lo, s6 5886; GFX10-NEXT: s_waitcnt vmcnt(0) 5887; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 5888; GFX10-NEXT: v_mov_b32_e32 v6, v4 5889; GFX10-NEXT: buffer_gl1_inv 5890; GFX10-NEXT: buffer_gl0_inv 5891; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5892; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5893; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5894; GFX10-NEXT: s_cbranch_execnz .LBB18_3 5895; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 5896; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5897; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5898; GFX10-NEXT: s_setpc_b64 s[30:31] 5899; 5900; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5901; GFX90A: ; %bb.0: 5902; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5903; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 5904; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 5905; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 5906; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 5907; GFX90A-NEXT: s_mov_b32 s4, 
0xffff 5908; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 5909; GFX90A-NEXT: v_not_b32_e32 v10, v4 5910; GFX90A-NEXT: s_mov_b64 s[6:7], exec 5911; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 5912; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 5913; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 5914; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 5915; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 5916; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5917; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5918; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5919; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5920; GFX90A-NEXT: s_nop 0 5921; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen 5922; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 5923; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 5924; GFX90A-NEXT: ; %bb.2: 5925; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 5926; GFX90A-NEXT: s_mov_b64 s[6:7], 0 5927; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 5928; GFX90A-NEXT: s_movk_i32 s14, 0x7fff 5929; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start 5930; GFX90A-NEXT: ; =>This Loop Header: Depth=1 5931; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 5932; GFX90A-NEXT: s_waitcnt vmcnt(0) 5933; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5934; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 5935; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 5936; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 5937; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 5938; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 5939; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 5940; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5941; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 5942; GFX90A-NEXT: s_mov_b64 s[12:13], exec 5943; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 5944; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 5945; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 5946; GFX90A-NEXT: v_readfirstlane_b32 
s8, v0 5947; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 5948; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 5949; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 5950; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5951; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5952; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5953; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5954; GFX90A-NEXT: s_waitcnt vmcnt(0) 5955; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc 5956; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 5957; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 5958; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 5959; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 5960; GFX90A-NEXT: s_waitcnt vmcnt(0) 5961; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 5962; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5963; GFX90A-NEXT: v_mov_b32_e32 v7, v4 5964; GFX90A-NEXT: buffer_wbinvl1 5965; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 5966; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 5967; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 5968; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5969; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 5970; GFX90A-NEXT: s_setpc_b64 s[30:31] 5971; 5972; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5973; GFX908: ; %bb.0: 5974; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5975; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 5976; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 5977; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 5978; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 5979; GFX908-NEXT: s_mov_b32 s4, 0xffff 5980; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 5981; GFX908-NEXT: v_not_b32_e32 v9, v4 5982; GFX908-NEXT: s_mov_b64 s[6:7], exec 5983; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 5984; GFX908-NEXT: v_readfirstlane_b32 s8, v0 5985; GFX908-NEXT: v_readfirstlane_b32 s9, v1 5986; GFX908-NEXT: v_readfirstlane_b32 s10, v2 5987; GFX908-NEXT: v_readfirstlane_b32 s11, v3 5988; GFX908-NEXT: v_cmp_eq_u64_e32 
vcc, s[8:9], v[0:1] 5989; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5990; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5991; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5992; GFX908-NEXT: s_nop 0 5993; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 5994; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 5995; GFX908-NEXT: s_cbranch_execnz .LBB18_1 5996; GFX908-NEXT: ; %bb.2: 5997; GFX908-NEXT: s_mov_b64 exec, s[6:7] 5998; GFX908-NEXT: s_mov_b64 s[6:7], 0 5999; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 6000; GFX908-NEXT: s_movk_i32 s14, 0x7fff 6001; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start 6002; GFX908-NEXT: ; =>This Loop Header: Depth=1 6003; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 6004; GFX908-NEXT: s_waitcnt vmcnt(0) 6005; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 6006; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 6007; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 6008; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 6009; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 6010; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 6011; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc 6012; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 6013; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 6014; GFX908-NEXT: v_mov_b32_e32 v4, v5 6015; GFX908-NEXT: s_mov_b64 s[12:13], exec 6016; GFX908-NEXT: v_mov_b32_e32 v5, v6 6017; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6018; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 6019; GFX908-NEXT: v_readfirstlane_b32 s8, v0 6020; GFX908-NEXT: v_readfirstlane_b32 s9, v1 6021; GFX908-NEXT: v_readfirstlane_b32 s10, v2 6022; GFX908-NEXT: v_readfirstlane_b32 s11, v3 6023; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6024; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6025; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6026; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6027; GFX908-NEXT: s_waitcnt vmcnt(0) 6028; 
GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 6029; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 6030; GFX908-NEXT: s_cbranch_execnz .LBB18_4 6031; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6032; GFX908-NEXT: s_mov_b64 exec, s[12:13] 6033; GFX908-NEXT: s_waitcnt vmcnt(0) 6034; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 6035; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6036; GFX908-NEXT: v_mov_b32_e32 v6, v4 6037; GFX908-NEXT: buffer_wbinvl1 6038; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 6039; GFX908-NEXT: s_cbranch_execnz .LBB18_3 6040; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 6041; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 6042; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 6043; GFX908-NEXT: s_setpc_b64 s[30:31] 6044; 6045; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 6046; GFX8: ; %bb.0: 6047; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6048; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 6049; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 6050; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 6051; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 6052; GFX8-NEXT: s_mov_b32 s4, 0xffff 6053; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 6054; GFX8-NEXT: v_not_b32_e32 v9, v4 6055; GFX8-NEXT: s_mov_b64 s[6:7], exec 6056; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6057; GFX8-NEXT: v_readfirstlane_b32 s8, v0 6058; GFX8-NEXT: v_readfirstlane_b32 s9, v1 6059; GFX8-NEXT: v_readfirstlane_b32 s10, v2 6060; GFX8-NEXT: v_readfirstlane_b32 s11, v3 6061; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6062; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6063; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6064; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6065; GFX8-NEXT: s_nop 0 6066; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 6067; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 6068; GFX8-NEXT: s_cbranch_execnz .LBB18_1 6069; GFX8-NEXT: ; %bb.2: 6070; GFX8-NEXT: s_mov_b64 exec, s[6:7] 6071; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 6072; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 6073; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start 6074; GFX8-NEXT: ; =>This Loop Header: Depth=1 6075; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 6076; GFX8-NEXT: s_waitcnt vmcnt(0) 6077; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 6078; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 6079; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 6080; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 6081; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 6082; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 6083; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 6084; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc 6085; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 6086; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 6087; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 6088; GFX8-NEXT: v_mov_b32_e32 v4, v5 6089; GFX8-NEXT: s_mov_b64 s[12:13], exec 6090; GFX8-NEXT: v_mov_b32_e32 v5, v6 6091; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6092; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 6093; GFX8-NEXT: v_readfirstlane_b32 s8, v0 6094; GFX8-NEXT: v_readfirstlane_b32 s9, v1 6095; GFX8-NEXT: v_readfirstlane_b32 s10, v2 6096; GFX8-NEXT: v_readfirstlane_b32 s11, v3 6097; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6098; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6099; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6100; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6101; GFX8-NEXT: s_waitcnt vmcnt(0) 6102; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 6103; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 6104; GFX8-NEXT: s_cbranch_execnz .LBB18_4 6105; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6106; GFX8-NEXT: s_mov_b64 exec, s[12:13] 6107; GFX8-NEXT: s_waitcnt vmcnt(0) 6108; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 6109; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6110; GFX8-NEXT: v_mov_b32_e32 v6, v4 6111; GFX8-NEXT: buffer_wbinvl1 6112; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 6113; GFX8-NEXT: s_cbranch_execnz .LBB18_3 6114; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 6115; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 6116; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 6117; GFX8-NEXT: s_setpc_b64 s[30:31] 6118; 6119; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 6120; GFX7: ; %bb.0: 6121; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6122; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 6123; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 6124; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 6125; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 6126; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 6127; GFX7-NEXT: v_not_b32_e32 v9, v4 6128; GFX7-NEXT: s_mov_b64 s[6:7], exec 6129; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6130; GFX7-NEXT: v_readfirstlane_b32 s8, v0 6131; GFX7-NEXT: v_readfirstlane_b32 s9, v1 6132; GFX7-NEXT: v_readfirstlane_b32 s10, v2 6133; GFX7-NEXT: v_readfirstlane_b32 s11, v3 6134; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6135; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6136; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6137; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6138; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 6139; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 6140; GFX7-NEXT: s_cbranch_execnz .LBB18_1 6141; GFX7-NEXT: ; %bb.2: 6142; GFX7-NEXT: s_mov_b64 exec, s[6:7] 6143; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 6144; GFX7-NEXT: s_mov_b64 s[6:7], 0 6145; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 6146; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start 6147; GFX7-NEXT: ; =>This Loop Header: Depth=1 6148; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 6149; GFX7-NEXT: s_waitcnt vmcnt(0) 6150; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 6151; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 6152; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 6153; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 6154; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 6155; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 
6156; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 6157; GFX7-NEXT: v_mov_b32_e32 v4, v5 6158; GFX7-NEXT: s_mov_b64 s[12:13], exec 6159; GFX7-NEXT: v_mov_b32_e32 v5, v6 6160; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6161; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 6162; GFX7-NEXT: v_readfirstlane_b32 s8, v0 6163; GFX7-NEXT: v_readfirstlane_b32 s9, v1 6164; GFX7-NEXT: v_readfirstlane_b32 s10, v2 6165; GFX7-NEXT: v_readfirstlane_b32 s11, v3 6166; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6167; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6168; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6169; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6170; GFX7-NEXT: s_waitcnt vmcnt(0) 6171; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 6172; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 6173; GFX7-NEXT: s_cbranch_execnz .LBB18_4 6174; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6175; GFX7-NEXT: s_mov_b64 exec, s[12:13] 6176; GFX7-NEXT: s_waitcnt vmcnt(0) 6177; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 6178; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6179; GFX7-NEXT: v_mov_b32_e32 v6, v4 6180; GFX7-NEXT: buffer_wbinvl1 6181; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 6182; GFX7-NEXT: s_cbranch_execnz .LBB18_3 6183; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 6184; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 6185; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 6186; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 6187; GFX7-NEXT: s_setpc_b64 s[30:31] 6188; 6189; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 6190; GFX6: ; %bb.0: 6191; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6192; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 6193; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 6194; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 6195; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 6196; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 6197; GFX6-NEXT: v_not_b32_e32 v9, v4 6198; GFX6-NEXT: s_mov_b64 s[6:7], exec 6199; GFX6-NEXT: .LBB18_1: ; =>This 
Inner Loop Header: Depth=1 6200; GFX6-NEXT: v_readfirstlane_b32 s8, v0 6201; GFX6-NEXT: v_readfirstlane_b32 s9, v1 6202; GFX6-NEXT: v_readfirstlane_b32 s10, v2 6203; GFX6-NEXT: v_readfirstlane_b32 s11, v3 6204; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6205; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6206; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6207; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6208; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 6209; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 6210; GFX6-NEXT: s_cbranch_execnz .LBB18_1 6211; GFX6-NEXT: ; %bb.2: 6212; GFX6-NEXT: s_mov_b64 exec, s[6:7] 6213; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 6214; GFX6-NEXT: s_mov_b64 s[6:7], 0 6215; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 6216; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start 6217; GFX6-NEXT: ; =>This Loop Header: Depth=1 6218; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 6219; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6220; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 6221; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 6222; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 6223; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 6224; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 6225; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 6226; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 6227; GFX6-NEXT: v_mov_b32_e32 v4, v5 6228; GFX6-NEXT: s_mov_b64 s[12:13], exec 6229; GFX6-NEXT: v_mov_b32_e32 v5, v6 6230; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6231; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 6232; GFX6-NEXT: v_readfirstlane_b32 s8, v0 6233; GFX6-NEXT: v_readfirstlane_b32 s9, v1 6234; GFX6-NEXT: v_readfirstlane_b32 s10, v2 6235; GFX6-NEXT: v_readfirstlane_b32 s11, v3 6236; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6237; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6238; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6239; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6240; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6241; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 6242; 
GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 6243; GFX6-NEXT: s_cbranch_execnz .LBB18_4 6244; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6245; GFX6-NEXT: s_mov_b64 exec, s[12:13] 6246; GFX6-NEXT: s_waitcnt vmcnt(0) 6247; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 6248; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6249; GFX6-NEXT: v_mov_b32_e32 v6, v4 6250; GFX6-NEXT: buffer_wbinvl1 6251; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 6252; GFX6-NEXT: s_cbranch_execnz .LBB18_3 6253; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 6254; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 6255; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 6256; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 6257; GFX6-NEXT: s_waitcnt expcnt(0) 6258; GFX6-NEXT: s_setpc_b64 s[30:31] 6259 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 6260 %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6261 ret bfloat %result 6262} 6263 6264; -------------------------------------------------------------------- 6265; <2 x half> 6266; -------------------------------------------------------------------- 6267 ; Value-returning case for packed half: an agent-scope, seq_cst atomicrmw fadd of <2 x half> through an inreg buffer fat pointer (ptr addrspace(7)), addressed 256 elements past %ptr, which the expectations below show as byte offset 1024 (0x400). The GFX12, GFX940 and GFX90A expectations are a single buffer_atomic_pk_add_f16 carrying the th:TH_ATOMIC_RETURN, sc0 and glc modifier respectively. The GFX11, GFX10, GFX908, GFX8, GFX7 and GFX6 expectations are an initial buffer load followed by a buffer_atomic_cmpswap retry loop at .LBB19_1; inside that loop GFX11, GFX10 and GFX908 add with v_pk_add_f16, GFX8 adds the two halves separately with v_add_f16, and GFX7 and GFX6 convert to f32, add with v_add_f32 and convert back. These expectations are autogenerated by utils/update_llc_test_checks.py, so regenerate them instead of editing by hand. 6268define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { 6269; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6270; GFX12: ; %bb.0: 6271; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6272; GFX12-NEXT: s_wait_expcnt 0x0 6273; GFX12-NEXT: s_wait_samplecnt 0x0 6274; GFX12-NEXT: s_wait_bvhcnt 0x0 6275; GFX12-NEXT: s_wait_kmcnt 0x0 6276; GFX12-NEXT: v_mov_b32_e32 v1, s16 6277; GFX12-NEXT: s_wait_storecnt 0x0 6278; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 6279; GFX12-NEXT: s_wait_loadcnt 0x0 6280; GFX12-NEXT: global_inv scope:SCOPE_DEV 6281; GFX12-NEXT: s_setpc_b64 s[30:31] 6282; 6283; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6284; GFX940: ; %bb.0: 6285; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6286; GFX940-NEXT: v_mov_b32_e32 v1, s16 6287; GFX940-NEXT: buffer_wbl2 sc1 6288; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 6289; GFX940-NEXT: s_waitcnt vmcnt(0) 6290; GFX940-NEXT: buffer_inv sc1 6291; GFX940-NEXT: s_setpc_b64 s[30:31] 6292; 6293; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6294; GFX11: ; %bb.0: 6295; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6296; GFX11-NEXT: s_add_i32 s4, s16, 0x400 6297; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6298; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 6299; GFX11-NEXT: v_mov_b32_e32 v0, s16 6300; GFX11-NEXT: s_mov_b32 s4, 0 6301; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 6302; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start 6303; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6304; GFX11-NEXT: s_waitcnt vmcnt(0) 6305; GFX11-NEXT: v_mov_b32_e32 v5, v0 6306; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6307; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6308; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 6309; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 6310; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 6311; GFX11-NEXT: s_waitcnt vmcnt(0) 6312; GFX11-NEXT: buffer_gl1_inv 6313; GFX11-NEXT: buffer_gl0_inv 6314; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 6315; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 6316; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6317; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 6318; GFX11-NEXT: s_cbranch_execnz .LBB19_1 6319; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6320; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 6321; GFX11-NEXT: s_setpc_b64 s[30:31] 6322; 6323; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6324; GFX10: ; %bb.0: 6325; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6326; GFX10-NEXT: v_mov_b32_e32 v2, v0 6327; GFX10-NEXT: v_mov_b32_e32 v0, s20 6328; GFX10-NEXT: s_add_i32 s4, s20, 0x400 6329; GFX10-NEXT: v_mov_b32_e32 v3, s4 6330; GFX10-NEXT: s_mov_b32 s4, 0 6331; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6332; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start 6333; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6334; GFX10-NEXT: s_waitcnt vmcnt(0) 6335; GFX10-NEXT: v_mov_b32_e32 v5, v0 6336; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6337; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 6338; GFX10-NEXT: v_mov_b32_e32 v0, v4 6339; GFX10-NEXT: v_mov_b32_e32 v1, v5 6340; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 6341; GFX10-NEXT: s_waitcnt vmcnt(0) 6342; GFX10-NEXT: buffer_gl1_inv 6343; GFX10-NEXT: buffer_gl0_inv 6344; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 6345; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6346; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6347; GFX10-NEXT: s_cbranch_execnz .LBB19_1 6348; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6349; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6350; GFX10-NEXT: s_setpc_b64 s[30:31] 6351; 6352; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6353; GFX90A: ; %bb.0: 6354; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6355; GFX90A-NEXT: v_mov_b32_e32 v1, s20 6356; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 glc 6357; GFX90A-NEXT: s_waitcnt vmcnt(0) 6358; GFX90A-NEXT: buffer_wbinvl1 6359; GFX90A-NEXT: s_setpc_b64 s[30:31] 6360; 6361; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6362; GFX908: ; %bb.0: 6363; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6364; GFX908-NEXT: v_mov_b32_e32 v2, v0 6365; GFX908-NEXT: v_mov_b32_e32 v0, s20 6366; GFX908-NEXT: 
buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6367; GFX908-NEXT: s_add_i32 s6, s20, 0x400 6368; GFX908-NEXT: s_mov_b64 s[4:5], 0 6369; GFX908-NEXT: v_mov_b32_e32 v3, s6 6370; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start 6371; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6372; GFX908-NEXT: s_waitcnt vmcnt(0) 6373; GFX908-NEXT: v_mov_b32_e32 v5, v0 6374; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 6375; GFX908-NEXT: v_mov_b32_e32 v0, v4 6376; GFX908-NEXT: v_mov_b32_e32 v1, v5 6377; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 6378; GFX908-NEXT: s_waitcnt vmcnt(0) 6379; GFX908-NEXT: buffer_wbinvl1 6380; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 6381; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6382; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 6383; GFX908-NEXT: s_cbranch_execnz .LBB19_1 6384; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6385; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6386; GFX908-NEXT: s_setpc_b64 s[30:31] 6387; 6388; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6389; GFX8: ; %bb.0: 6390; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6391; GFX8-NEXT: v_mov_b32_e32 v2, v0 6392; GFX8-NEXT: v_mov_b32_e32 v0, s20 6393; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6394; GFX8-NEXT: s_add_i32 s6, s20, 0x400 6395; GFX8-NEXT: s_mov_b64 s[4:5], 0 6396; GFX8-NEXT: v_mov_b32_e32 v3, s6 6397; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start 6398; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6399; GFX8-NEXT: s_waitcnt vmcnt(0) 6400; GFX8-NEXT: v_mov_b32_e32 v5, v0 6401; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 6402; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 6403; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 6404; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 6405; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 6406; GFX8-NEXT: v_mov_b32_e32 v0, v4 6407; GFX8-NEXT: v_mov_b32_e32 v1, v5 6408; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 
0 offen glc 6409; GFX8-NEXT: s_waitcnt vmcnt(0) 6410; GFX8-NEXT: buffer_wbinvl1 6411; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 6412; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6413; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6414; GFX8-NEXT: s_cbranch_execnz .LBB19_1 6415; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6416; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6417; GFX8-NEXT: s_setpc_b64 s[30:31] 6418; 6419; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6420; GFX7: ; %bb.0: 6421; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6422; GFX7-NEXT: v_mov_b32_e32 v2, s20 6423; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 6424; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 6425; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 6426; GFX7-NEXT: s_add_i32 s6, s20, 0x400 6427; GFX7-NEXT: s_mov_b64 s[4:5], 0 6428; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 6429; GFX7-NEXT: s_waitcnt vmcnt(0) 6430; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 6431; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 6432; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 6433; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 6434; GFX7-NEXT: v_mov_b32_e32 v4, s6 6435; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start 6436; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6437; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 6438; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 6439; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 6440; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 6441; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6442; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 6443; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 6444; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6445; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 6446; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 6447; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 6448; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 6449; GFX7-NEXT: v_mov_b32_e32 v8, v6 6450; GFX7-NEXT: v_mov_b32_e32 v7, v5 6451; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 6452; GFX7-NEXT: s_waitcnt vmcnt(0) 6453; GFX7-NEXT: buffer_wbinvl1 6454; GFX7-NEXT: v_lshrrev_b32_e32 
v1, 16, v7 6455; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 6456; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 6457; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 6458; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6459; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6460; GFX7-NEXT: s_cbranch_execnz .LBB19_1 6461; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6462; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6463; GFX7-NEXT: s_setpc_b64 s[30:31] 6464; 6465; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 6466; GFX6: ; %bb.0: 6467; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6468; GFX6-NEXT: v_mov_b32_e32 v2, s20 6469; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 6470; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 6471; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 6472; GFX6-NEXT: s_add_i32 s6, s20, 0x400 6473; GFX6-NEXT: s_mov_b64 s[4:5], 0 6474; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 6475; GFX6-NEXT: s_waitcnt vmcnt(0) 6476; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 6477; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 6478; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 6479; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 6480; GFX6-NEXT: v_mov_b32_e32 v4, s6 6481; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start 6482; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 6483; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 6484; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 6485; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 6486; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 6487; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6488; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 6489; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 6490; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 6491; GFX6-NEXT: s_waitcnt expcnt(0) 6492; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 6493; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 6494; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 6495; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 6496; GFX6-NEXT: v_mov_b32_e32 v8, v6 6497; GFX6-NEXT: v_mov_b32_e32 v7, v5 6498; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 6499; GFX6-NEXT: s_waitcnt vmcnt(0) 6500; GFX6-NEXT: 
buffer_wbinvl1 6501; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 6502; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 6503; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 6504; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 6505; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6506; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 6507; GFX6-NEXT: s_cbranch_execnz .LBB19_1 6508; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 6509; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 6510; GFX6-NEXT: s_waitcnt expcnt(0) 6511; GFX6-NEXT: s_setpc_b64 s[30:31] 6512 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 6513 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6514 ret <2 x half> %result 6515} 6516 6517define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { 6518; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6519; GFX12: ; %bb.0: 6520; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6521; GFX12-NEXT: s_wait_expcnt 0x0 6522; GFX12-NEXT: s_wait_samplecnt 0x0 6523; GFX12-NEXT: s_wait_bvhcnt 0x0 6524; GFX12-NEXT: s_wait_kmcnt 0x0 6525; GFX12-NEXT: v_mov_b32_e32 v1, s16 6526; GFX12-NEXT: s_wait_storecnt 0x0 6527; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 6528; GFX12-NEXT: s_wait_storecnt 0x0 6529; GFX12-NEXT: global_inv scope:SCOPE_DEV 6530; GFX12-NEXT: s_setpc_b64 s[30:31] 6531; 6532; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6533; GFX940: ; %bb.0: 6534; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6535; GFX940-NEXT: v_mov_b32_e32 v1, s16 6536; GFX940-NEXT: buffer_wbl2 sc1 6537; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 6538; GFX940-NEXT: s_waitcnt vmcnt(0) 6539; GFX940-NEXT: buffer_inv sc1 6540; GFX940-NEXT: s_setpc_b64 s[30:31] 6541; 6542; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6543; GFX11: ; %bb.0: 6544; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6545; GFX11-NEXT: v_mov_b32_e32 v1, s16 6546; GFX11-NEXT: s_add_i32 s4, s16, 0x400 6547; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6548; GFX11-NEXT: v_mov_b32_e32 v3, s4 6549; GFX11-NEXT: s_mov_b32 s4, 0 6550; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 6551; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start 6552; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6553; GFX11-NEXT: s_waitcnt vmcnt(0) 6554; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 6555; GFX11-NEXT: v_mov_b32_e32 v5, v2 6556; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6557; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 6558; GFX11-NEXT: v_mov_b32_e32 v4, v1 6559; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc 6560; GFX11-NEXT: s_waitcnt vmcnt(0) 6561; GFX11-NEXT: buffer_gl1_inv 6562; GFX11-NEXT: buffer_gl0_inv 6563; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 6564; GFX11-NEXT: v_mov_b32_e32 v2, v4 6565; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 6566; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6567; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 6568; GFX11-NEXT: s_cbranch_execnz .LBB20_1 6569; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6570; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 6571; GFX11-NEXT: s_setpc_b64 s[30:31] 6572; 6573; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6574; GFX10: ; %bb.0: 6575; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6576; GFX10-NEXT: v_mov_b32_e32 v1, s20 6577; GFX10-NEXT: s_add_i32 s4, s20, 0x400 6578; GFX10-NEXT: v_mov_b32_e32 v3, s4 6579; GFX10-NEXT: s_mov_b32 s4, 0 6580; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 6581; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start 6582; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6583; GFX10-NEXT: s_waitcnt vmcnt(0) 6584; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 6585; 
GFX10-NEXT: v_mov_b32_e32 v5, v2 6586; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6587; GFX10-NEXT: v_mov_b32_e32 v4, v1 6588; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 6589; GFX10-NEXT: s_waitcnt vmcnt(0) 6590; GFX10-NEXT: buffer_gl1_inv 6591; GFX10-NEXT: buffer_gl0_inv 6592; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 6593; GFX10-NEXT: v_mov_b32_e32 v2, v4 6594; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6595; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6596; GFX10-NEXT: s_cbranch_execnz .LBB20_1 6597; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6598; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6599; GFX10-NEXT: s_setpc_b64 s[30:31] 6600; 6601; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6602; GFX90A: ; %bb.0: 6603; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6604; GFX90A-NEXT: v_mov_b32_e32 v1, s20 6605; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 6606; GFX90A-NEXT: s_waitcnt vmcnt(0) 6607; GFX90A-NEXT: buffer_wbinvl1 6608; GFX90A-NEXT: s_setpc_b64 s[30:31] 6609; 6610; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6611; GFX908: ; %bb.0: 6612; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6613; GFX908-NEXT: v_mov_b32_e32 v1, s20 6614; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 6615; GFX908-NEXT: s_waitcnt vmcnt(0) 6616; GFX908-NEXT: buffer_wbinvl1 6617; GFX908-NEXT: s_setpc_b64 s[30:31] 6618; 6619; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6620; GFX8: ; %bb.0: 6621; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6622; GFX8-NEXT: v_mov_b32_e32 v1, s20 6623; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 6624; GFX8-NEXT: s_add_i32 s6, s20, 0x400 6625; GFX8-NEXT: s_mov_b64 s[4:5], 0 6626; GFX8-NEXT: v_mov_b32_e32 v3, s6 6627; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start 6628; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 6629; GFX8-NEXT: s_waitcnt vmcnt(0) 6630; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 6631; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 6632; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 6633; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6634; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 6635; GFX8-NEXT: v_mov_b32_e32 v5, v2 6636; GFX8-NEXT: v_mov_b32_e32 v4, v1 6637; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 6638; GFX8-NEXT: s_waitcnt vmcnt(0) 6639; GFX8-NEXT: buffer_wbinvl1 6640; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 6641; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6642; GFX8-NEXT: v_mov_b32_e32 v2, v4 6643; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6644; GFX8-NEXT: s_cbranch_execnz .LBB20_1 6645; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6646; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6647; GFX8-NEXT: s_setpc_b64 s[30:31] 6648; 6649; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6650; GFX7: ; %bb.0: 6651; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6652; GFX7-NEXT: v_mov_b32_e32 v2, s20 6653; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 6654; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 6655; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 6656; GFX7-NEXT: s_add_i32 s6, s20, 0x400 6657; GFX7-NEXT: s_mov_b64 s[4:5], 0 6658; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 6659; GFX7-NEXT: s_waitcnt vmcnt(0) 6660; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 6661; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 6662; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 6663; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 6664; GFX7-NEXT: v_mov_b32_e32 v2, s6 6665; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start 6666; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6667; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 6668; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 6669; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 6670; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 6671; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 6672; GFX7-NEXT: v_add_f32_e32 
v5, v5, v0 6673; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 6674; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 6675; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 6676; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 6677; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 6678; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 6679; GFX7-NEXT: v_mov_b32_e32 v7, v5 6680; GFX7-NEXT: v_mov_b32_e32 v6, v4 6681; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 6682; GFX7-NEXT: s_waitcnt vmcnt(0) 6683; GFX7-NEXT: buffer_wbinvl1 6684; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 6685; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 6686; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 6687; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 6688; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6689; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6690; GFX7-NEXT: s_cbranch_execnz .LBB20_1 6691; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6692; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6693; GFX7-NEXT: s_setpc_b64 s[30:31] 6694; 6695; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 6696; GFX6: ; %bb.0: 6697; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6698; GFX6-NEXT: v_mov_b32_e32 v2, s20 6699; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 6700; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 6701; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 6702; GFX6-NEXT: s_add_i32 s6, s20, 0x400 6703; GFX6-NEXT: s_mov_b64 s[4:5], 0 6704; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 6705; GFX6-NEXT: s_waitcnt vmcnt(0) 6706; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 6707; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 6708; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 6709; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 6710; GFX6-NEXT: v_mov_b32_e32 v2, s6 6711; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start 6712; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 6713; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 6714; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 6715; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 6716; GFX6-NEXT: s_waitcnt expcnt(0) 6717; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 6718; GFX6-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 6719; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 6720; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 6721; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 6722; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 6723; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 6724; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 6725; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 6726; GFX6-NEXT: v_mov_b32_e32 v7, v5 6727; GFX6-NEXT: v_mov_b32_e32 v6, v4 6728; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 6729; GFX6-NEXT: s_waitcnt vmcnt(0) 6730; GFX6-NEXT: buffer_wbinvl1 6731; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 6732; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 6733; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 6734; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 6735; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6736; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 6737; GFX6-NEXT: s_cbranch_execnz .LBB20_1 6738; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 6739; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 6740; GFX6-NEXT: s_waitcnt expcnt(0) 6741; GFX6-NEXT: s_setpc_b64 s[30:31] 6742 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 6743 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6744 ret void 6745} 6746 6747define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { 6748; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6749; GFX12: ; %bb.0: 6750; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6751; GFX12-NEXT: s_wait_expcnt 0x0 6752; GFX12-NEXT: s_wait_samplecnt 0x0 6753; GFX12-NEXT: s_wait_bvhcnt 0x0 6754; GFX12-NEXT: s_wait_kmcnt 0x0 6755; GFX12-NEXT: s_mov_b32 s1, exec_lo 6756; GFX12-NEXT: s_wait_storecnt 0x0 6757; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 6758; GFX12-NEXT: v_readfirstlane_b32 s4, v0 6759; GFX12-NEXT: v_readfirstlane_b32 s5, v1 6760; GFX12-NEXT: v_readfirstlane_b32 
s6, v2 6761; GFX12-NEXT: v_readfirstlane_b32 s7, v3 6762; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6763; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6764; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6765; GFX12-NEXT: s_wait_alu 0xfffe 6766; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6767; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 6768; GFX12-NEXT: s_wait_alu 0xfffe 6769; GFX12-NEXT: s_and_saveexec_b32 s0, s0 6770; GFX12-NEXT: s_wait_loadcnt 0x0 6771; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN 6772; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 6773; GFX12-NEXT: ; implicit-def: $vgpr4 6774; GFX12-NEXT: s_wait_alu 0xfffe 6775; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 6776; GFX12-NEXT: s_cbranch_execnz .LBB21_1 6777; GFX12-NEXT: ; %bb.2: 6778; GFX12-NEXT: s_mov_b32 exec_lo, s1 6779; GFX12-NEXT: s_wait_loadcnt 0x0 6780; GFX12-NEXT: v_mov_b32_e32 v0, v5 6781; GFX12-NEXT: global_inv scope:SCOPE_DEV 6782; GFX12-NEXT: s_wait_alu 0xfffe 6783; GFX12-NEXT: s_setpc_b64 s[30:31] 6784; 6785; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6786; GFX940: ; %bb.0: 6787; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6788; GFX940-NEXT: s_mov_b64 s[2:3], exec 6789; GFX940-NEXT: buffer_wbl2 sc1 6790; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 6791; GFX940-NEXT: v_readfirstlane_b32 s4, v0 6792; GFX940-NEXT: v_readfirstlane_b32 s5, v1 6793; GFX940-NEXT: v_readfirstlane_b32 s6, v2 6794; GFX940-NEXT: v_readfirstlane_b32 s7, v3 6795; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 6796; GFX940-NEXT: s_nop 0 6797; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 6798; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 6799; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 6800; GFX940-NEXT: s_waitcnt vmcnt(0) 6801; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 6802; 
GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 6803; GFX940-NEXT: ; implicit-def: $vgpr4 6804; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 6805; GFX940-NEXT: s_cbranch_execnz .LBB21_1 6806; GFX940-NEXT: ; %bb.2: 6807; GFX940-NEXT: s_mov_b64 exec, s[2:3] 6808; GFX940-NEXT: s_waitcnt vmcnt(0) 6809; GFX940-NEXT: v_mov_b32_e32 v0, v5 6810; GFX940-NEXT: buffer_inv sc1 6811; GFX940-NEXT: s_setpc_b64 s[30:31] 6812; 6813; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6814; GFX11: ; %bb.0: 6815; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6816; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 6817; GFX11-NEXT: s_mov_b32 s1, 0 6818; GFX11-NEXT: s_mov_b32 s2, exec_lo 6819; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 6820; GFX11-NEXT: v_readfirstlane_b32 s4, v0 6821; GFX11-NEXT: v_readfirstlane_b32 s5, v1 6822; GFX11-NEXT: v_readfirstlane_b32 s6, v2 6823; GFX11-NEXT: v_readfirstlane_b32 s7, v3 6824; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6825; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6826; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6827; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 6828; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 6829; GFX11-NEXT: s_and_saveexec_b32 s0, s0 6830; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 6831; GFX11-NEXT: ; implicit-def: $vgpr4 6832; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 6833; GFX11-NEXT: s_cbranch_execnz .LBB21_1 6834; GFX11-NEXT: ; %bb.2: 6835; GFX11-NEXT: s_mov_b32 exec_lo, s2 6836; GFX11-NEXT: .p2align 6 6837; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start 6838; GFX11-NEXT: ; =>This Loop Header: Depth=1 6839; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 6840; GFX11-NEXT: s_waitcnt vmcnt(0) 6841; GFX11-NEXT: v_pk_add_f16 v7, v8, v5 6842; GFX11-NEXT: s_mov_b32 s2, exec_lo 6843; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6844; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 6845; GFX11-NEXT: v_mov_b32_e32 v6, v7 6846; GFX11-NEXT: v_mov_b32_e32 v7, v8 6847; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 6848; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 6849; GFX11-NEXT: v_readfirstlane_b32 s4, v0 6850; GFX11-NEXT: v_readfirstlane_b32 s5, v1 6851; GFX11-NEXT: v_readfirstlane_b32 s6, v2 6852; GFX11-NEXT: v_readfirstlane_b32 s7, v3 6853; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6854; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6855; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6856; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 6857; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 6858; GFX11-NEXT: s_and_saveexec_b32 s0, s0 6859; GFX11-NEXT: s_waitcnt vmcnt(0) 6860; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc 6861; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 6862; GFX11-NEXT: s_cbranch_execnz .LBB21_4 6863; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 6864; GFX11-NEXT: s_mov_b32 exec_lo, s2 6865; GFX11-NEXT: s_waitcnt vmcnt(0) 6866; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 6867; GFX11-NEXT: v_mov_b32_e32 v8, v6 6868; GFX11-NEXT: buffer_gl1_inv 6869; GFX11-NEXT: buffer_gl0_inv 6870; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 6871; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6872; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6873; GFX11-NEXT: s_cbranch_execnz .LBB21_3 6874; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 6875; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 6876; GFX11-NEXT: v_mov_b32_e32 v0, v6 6877; GFX11-NEXT: s_setpc_b64 s[30:31] 6878; 6879; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6880; GFX10: ; %bb.0: 6881; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6882; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 6883; GFX10-NEXT: s_mov_b32 s5, 0 6884; GFX10-NEXT: s_mov_b32 s6, exec_lo 6885; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop 
Header: Depth=1 6886; GFX10-NEXT: v_readfirstlane_b32 s8, v0 6887; GFX10-NEXT: v_readfirstlane_b32 s9, v1 6888; GFX10-NEXT: v_readfirstlane_b32 s10, v2 6889; GFX10-NEXT: v_readfirstlane_b32 s11, v3 6890; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 6891; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 6892; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 6893; GFX10-NEXT: s_and_saveexec_b32 s4, s4 6894; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 6895; GFX10-NEXT: ; implicit-def: $vgpr4 6896; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6897; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 6898; GFX10-NEXT: s_cbranch_execnz .LBB21_1 6899; GFX10-NEXT: ; %bb.2: 6900; GFX10-NEXT: s_mov_b32 exec_lo, s6 6901; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start 6902; GFX10-NEXT: ; =>This Loop Header: Depth=1 6903; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 6904; GFX10-NEXT: s_waitcnt vmcnt(0) 6905; GFX10-NEXT: v_pk_add_f16 v7, v8, v5 6906; GFX10-NEXT: s_mov_b32 s6, exec_lo 6907; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6908; GFX10-NEXT: v_mov_b32_e32 v6, v7 6909; GFX10-NEXT: v_mov_b32_e32 v7, v8 6910; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 6911; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 6912; GFX10-NEXT: v_readfirstlane_b32 s8, v0 6913; GFX10-NEXT: v_readfirstlane_b32 s9, v1 6914; GFX10-NEXT: v_readfirstlane_b32 s10, v2 6915; GFX10-NEXT: v_readfirstlane_b32 s11, v3 6916; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 6917; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 6918; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 6919; GFX10-NEXT: s_and_saveexec_b32 s4, s4 6920; GFX10-NEXT: s_waitcnt vmcnt(0) 6921; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 6922; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6923; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 6924; GFX10-NEXT: s_cbranch_execnz .LBB21_4 6925; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 6926; GFX10-NEXT: s_mov_b32 exec_lo, s6 6927; GFX10-NEXT: s_waitcnt vmcnt(0) 6928; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v6, v8 6929; GFX10-NEXT: v_mov_b32_e32 v8, v6 6930; GFX10-NEXT: buffer_gl1_inv 6931; GFX10-NEXT: buffer_gl0_inv 6932; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 6933; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6934; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 6935; GFX10-NEXT: s_cbranch_execnz .LBB21_3 6936; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 6937; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 6938; GFX10-NEXT: v_mov_b32_e32 v0, v6 6939; GFX10-NEXT: s_setpc_b64 s[30:31] 6940; 6941; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6942; GFX90A: ; %bb.0: 6943; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6944; GFX90A-NEXT: s_mov_b64 s[6:7], exec 6945; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 6946; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 6947; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 6948; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 6949; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 6950; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6951; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6952; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6953; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6954; GFX90A-NEXT: s_waitcnt vmcnt(0) 6955; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc 6956; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 6957; GFX90A-NEXT: ; implicit-def: $vgpr4 6958; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 6959; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 6960; GFX90A-NEXT: ; %bb.2: 6961; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 6962; GFX90A-NEXT: s_waitcnt vmcnt(0) 6963; GFX90A-NEXT: v_mov_b32_e32 v0, v5 6964; GFX90A-NEXT: buffer_wbinvl1 6965; GFX90A-NEXT: s_setpc_b64 s[30:31] 6966; 6967; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6968; GFX908: ; %bb.0: 6969; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6970; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 6971; 
GFX908-NEXT: s_mov_b64 s[6:7], exec 6972; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 6973; GFX908-NEXT: v_readfirstlane_b32 s8, v0 6974; GFX908-NEXT: v_readfirstlane_b32 s9, v1 6975; GFX908-NEXT: v_readfirstlane_b32 s10, v2 6976; GFX908-NEXT: v_readfirstlane_b32 s11, v3 6977; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6978; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6979; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6980; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6981; GFX908-NEXT: s_nop 0 6982; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 6983; GFX908-NEXT: ; implicit-def: $vgpr4 6984; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 6985; GFX908-NEXT: s_cbranch_execnz .LBB21_1 6986; GFX908-NEXT: ; %bb.2: 6987; GFX908-NEXT: s_mov_b64 exec, s[6:7] 6988; GFX908-NEXT: s_mov_b64 s[6:7], 0 6989; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start 6990; GFX908-NEXT: ; =>This Loop Header: Depth=1 6991; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 6992; GFX908-NEXT: s_waitcnt vmcnt(0) 6993; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 6994; GFX908-NEXT: v_mov_b32_e32 v6, v7 6995; GFX908-NEXT: s_mov_b64 s[12:13], exec 6996; GFX908-NEXT: v_mov_b32_e32 v7, v8 6997; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 6998; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 6999; GFX908-NEXT: v_readfirstlane_b32 s8, v0 7000; GFX908-NEXT: v_readfirstlane_b32 s9, v1 7001; GFX908-NEXT: v_readfirstlane_b32 s10, v2 7002; GFX908-NEXT: v_readfirstlane_b32 s11, v3 7003; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7004; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7005; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7006; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7007; GFX908-NEXT: s_waitcnt vmcnt(0) 7008; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 7009; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 7010; GFX908-NEXT: s_cbranch_execnz .LBB21_4 7011; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7012; 
GFX908-NEXT: s_mov_b64 exec, s[12:13] 7013; GFX908-NEXT: s_waitcnt vmcnt(0) 7014; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 7015; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7016; GFX908-NEXT: v_mov_b32_e32 v8, v6 7017; GFX908-NEXT: buffer_wbinvl1 7018; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 7019; GFX908-NEXT: s_cbranch_execnz .LBB21_3 7020; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 7021; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 7022; GFX908-NEXT: v_mov_b32_e32 v0, v6 7023; GFX908-NEXT: s_setpc_b64 s[30:31] 7024; 7025; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 7026; GFX8: ; %bb.0: 7027; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7028; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 7029; GFX8-NEXT: s_mov_b64 s[6:7], exec 7030; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7031; GFX8-NEXT: v_readfirstlane_b32 s8, v0 7032; GFX8-NEXT: v_readfirstlane_b32 s9, v1 7033; GFX8-NEXT: v_readfirstlane_b32 s10, v2 7034; GFX8-NEXT: v_readfirstlane_b32 s11, v3 7035; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7036; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7037; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7038; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7039; GFX8-NEXT: s_nop 0 7040; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 7041; GFX8-NEXT: ; implicit-def: $vgpr4 7042; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 7043; GFX8-NEXT: s_cbranch_execnz .LBB21_1 7044; GFX8-NEXT: ; %bb.2: 7045; GFX8-NEXT: s_mov_b64 exec, s[6:7] 7046; GFX8-NEXT: s_mov_b64 s[6:7], 0 7047; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start 7048; GFX8-NEXT: ; =>This Loop Header: Depth=1 7049; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 7050; GFX8-NEXT: s_waitcnt vmcnt(0) 7051; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8 7052; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 7053; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 7054; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 
7055; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 7056; GFX8-NEXT: v_mov_b32_e32 v6, v7 7057; GFX8-NEXT: s_mov_b64 s[12:13], exec 7058; GFX8-NEXT: v_mov_b32_e32 v7, v8 7059; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7060; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 7061; GFX8-NEXT: v_readfirstlane_b32 s8, v0 7062; GFX8-NEXT: v_readfirstlane_b32 s9, v1 7063; GFX8-NEXT: v_readfirstlane_b32 s10, v2 7064; GFX8-NEXT: v_readfirstlane_b32 s11, v3 7065; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7066; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7067; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7068; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7069; GFX8-NEXT: s_waitcnt vmcnt(0) 7070; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc 7071; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 7072; GFX8-NEXT: s_cbranch_execnz .LBB21_4 7073; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7074; GFX8-NEXT: s_mov_b64 exec, s[12:13] 7075; GFX8-NEXT: s_waitcnt vmcnt(0) 7076; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 7077; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7078; GFX8-NEXT: v_mov_b32_e32 v8, v6 7079; GFX8-NEXT: buffer_wbinvl1 7080; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 7081; GFX8-NEXT: s_cbranch_execnz .LBB21_3 7082; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 7083; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 7084; GFX8-NEXT: v_mov_b32_e32 v0, v6 7085; GFX8-NEXT: s_setpc_b64 s[30:31] 7086; 7087; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 7088; GFX7: ; %bb.0: 7089; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7090; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 7091; GFX7-NEXT: s_mov_b64 s[6:7], exec 7092; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7093; GFX7-NEXT: v_readfirstlane_b32 s8, v0 7094; GFX7-NEXT: v_readfirstlane_b32 s9, v1 7095; GFX7-NEXT: v_readfirstlane_b32 s10, v2 7096; GFX7-NEXT: v_readfirstlane_b32 s11, v3 7097; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7098; 
GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7099; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7100; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7101; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 7102; GFX7-NEXT: ; implicit-def: $vgpr4 7103; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 7104; GFX7-NEXT: s_cbranch_execnz .LBB21_1 7105; GFX7-NEXT: ; %bb.2: 7106; GFX7-NEXT: s_mov_b64 exec, s[6:7] 7107; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 7108; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 7109; GFX7-NEXT: s_waitcnt vmcnt(0) 7110; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 7111; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 7112; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 7113; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 7114; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 7115; GFX7-NEXT: s_mov_b64 s[6:7], 0 7116; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start 7117; GFX7-NEXT: ; =>This Loop Header: Depth=1 7118; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 7119; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 7120; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 7121; GFX7-NEXT: s_mov_b64 s[12:13], exec 7122; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 7123; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 7124; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 7125; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 7126; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 7127; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 7128; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 7129; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 7130; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 7131; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 7132; GFX7-NEXT: v_mov_b32_e32 v8, v6 7133; GFX7-NEXT: v_mov_b32_e32 v7, v5 7134; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7135; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 7136; GFX7-NEXT: v_readfirstlane_b32 s8, v0 7137; GFX7-NEXT: v_readfirstlane_b32 s9, v1 7138; GFX7-NEXT: v_readfirstlane_b32 s10, v2 7139; GFX7-NEXT: v_readfirstlane_b32 s11, v3 7140; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7141; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7142; GFX7-NEXT: s_and_b64 s[4:5], vcc, 
s[4:5] 7143; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7144; GFX7-NEXT: s_waitcnt vmcnt(0) 7145; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc 7146; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 7147; GFX7-NEXT: s_cbranch_execnz .LBB21_4 7148; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7149; GFX7-NEXT: s_mov_b64 exec, s[12:13] 7150; GFX7-NEXT: s_waitcnt vmcnt(0) 7151; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 7152; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 7153; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 7154; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 7155; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7156; GFX7-NEXT: buffer_wbinvl1 7157; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 7158; GFX7-NEXT: s_cbranch_execnz .LBB21_3 7159; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 7160; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 7161; GFX7-NEXT: v_mov_b32_e32 v0, v4 7162; GFX7-NEXT: v_mov_b32_e32 v1, v5 7163; GFX7-NEXT: s_setpc_b64 s[30:31] 7164; 7165; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 7166; GFX6: ; %bb.0: 7167; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7168; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 7169; GFX6-NEXT: s_mov_b64 s[6:7], exec 7170; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7171; GFX6-NEXT: v_readfirstlane_b32 s8, v0 7172; GFX6-NEXT: v_readfirstlane_b32 s9, v1 7173; GFX6-NEXT: v_readfirstlane_b32 s10, v2 7174; GFX6-NEXT: v_readfirstlane_b32 s11, v3 7175; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7176; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7177; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7178; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7179; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 7180; GFX6-NEXT: ; implicit-def: $vgpr4 7181; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 7182; GFX6-NEXT: s_cbranch_execnz .LBB21_1 7183; GFX6-NEXT: ; %bb.2: 7184; GFX6-NEXT: s_mov_b64 exec, s[6:7] 7185; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 7186; 
GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 7187; GFX6-NEXT: s_waitcnt vmcnt(0) 7188; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 7189; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 7190; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 7191; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 7192; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 7193; GFX6-NEXT: s_mov_b64 s[6:7], 0 7194; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start 7195; GFX6-NEXT: ; =>This Loop Header: Depth=1 7196; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 7197; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 7198; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 7199; GFX6-NEXT: s_mov_b64 s[12:13], exec 7200; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 7201; GFX6-NEXT: s_waitcnt expcnt(0) 7202; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 7203; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 7204; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 7205; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 7206; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 7207; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 7208; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 7209; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 7210; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 7211; GFX6-NEXT: v_mov_b32_e32 v8, v6 7212; GFX6-NEXT: v_mov_b32_e32 v7, v5 7213; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7214; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 7215; GFX6-NEXT: v_readfirstlane_b32 s8, v0 7216; GFX6-NEXT: v_readfirstlane_b32 s9, v1 7217; GFX6-NEXT: v_readfirstlane_b32 s10, v2 7218; GFX6-NEXT: v_readfirstlane_b32 s11, v3 7219; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7220; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7221; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7222; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7223; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7224; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc 7225; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 7226; GFX6-NEXT: s_cbranch_execnz .LBB21_4 7227; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7228; GFX6-NEXT: s_mov_b64 exec, s[12:13] 7229; GFX6-NEXT: s_waitcnt vmcnt(0) 7230; GFX6-NEXT: 
v_lshrrev_b32_e32 v5, 16, v7 7231; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 7232; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 7233; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 7234; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7235; GFX6-NEXT: buffer_wbinvl1 7236; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 7237; GFX6-NEXT: s_cbranch_execnz .LBB21_3 7238; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 7239; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 7240; GFX6-NEXT: v_mov_b32_e32 v0, v4 7241; GFX6-NEXT: v_mov_b32_e32 v1, v5 7242; GFX6-NEXT: s_waitcnt expcnt(0) 7243; GFX6-NEXT: s_setpc_b64 s[30:31] 7244 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 7245 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7246 ret <2 x half> %result 7247} 7248; Returning agent-scope seq_cst fadd of <2 x half> through a buffer fat pointer, with no !amdgpu.* metadata on the atomic and no attribute group on the function; in the checks below GFX12 and GFX940 select buffer_atomic_pk_add_f16 and every other target expands to a buffer_atomic_cmpswap loop. 7249define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { 7250; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7251; GFX12: ; %bb.0: 7252; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7253; GFX12-NEXT: s_wait_expcnt 0x0 7254; GFX12-NEXT: s_wait_samplecnt 0x0 7255; GFX12-NEXT: s_wait_bvhcnt 0x0 7256; GFX12-NEXT: s_wait_kmcnt 0x0 7257; GFX12-NEXT: v_mov_b32_e32 v1, s16 7258; GFX12-NEXT: s_wait_storecnt 0x0 7259; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 7260; GFX12-NEXT: s_wait_loadcnt 0x0 7261; GFX12-NEXT: global_inv scope:SCOPE_DEV 7262; GFX12-NEXT: s_setpc_b64 s[30:31] 7263; 7264; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7265; GFX940: ; %bb.0: 7266; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7267; GFX940-NEXT: v_mov_b32_e32 v1, s16 7268; GFX940-NEXT: buffer_wbl2 sc1 7269; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 7270; GFX940-NEXT: s_waitcnt vmcnt(0) 7271; GFX940-NEXT: buffer_inv sc1 7272; GFX940-NEXT: s_setpc_b64 s[30:31] 7273; 7274; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7275; GFX11: ; %bb.0: 7276; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7277; GFX11-NEXT: s_add_i32 s4, s16, 0x400 7278; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7279; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 7280; GFX11-NEXT: v_mov_b32_e32 v0, s16 7281; GFX11-NEXT: s_mov_b32 s4, 0 7282; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 7283; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start 7284; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7285; GFX11-NEXT: s_waitcnt vmcnt(0) 7286; GFX11-NEXT: v_mov_b32_e32 v5, v0 7287; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7288; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7289; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 7290; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 7291; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 7292; GFX11-NEXT: s_waitcnt vmcnt(0) 7293; GFX11-NEXT: buffer_gl1_inv 7294; GFX11-NEXT: buffer_gl0_inv 7295; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 7296; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 7297; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7298; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 7299; GFX11-NEXT: s_cbranch_execnz .LBB22_1 7300; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7301; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 7302; GFX11-NEXT: s_setpc_b64 s[30:31] 7303; 7304; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7305; GFX10: ; %bb.0: 7306; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7307; GFX10-NEXT: v_mov_b32_e32 v2, v0 7308; GFX10-NEXT: v_mov_b32_e32 v0, s20 7309; GFX10-NEXT: s_add_i32 s4, s20, 0x400 7310; GFX10-NEXT: v_mov_b32_e32 v3, s4 7311; GFX10-NEXT: s_mov_b32 s4, 0 7312; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7313; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start 7314; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7315; GFX10-NEXT: s_waitcnt vmcnt(0) 7316; GFX10-NEXT: 
v_mov_b32_e32 v5, v0 7317; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7318; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 7319; GFX10-NEXT: v_mov_b32_e32 v0, v4 7320; GFX10-NEXT: v_mov_b32_e32 v1, v5 7321; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7322; GFX10-NEXT: s_waitcnt vmcnt(0) 7323; GFX10-NEXT: buffer_gl1_inv 7324; GFX10-NEXT: buffer_gl0_inv 7325; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 7326; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7327; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7328; GFX10-NEXT: s_cbranch_execnz .LBB22_1 7329; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7330; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7331; GFX10-NEXT: s_setpc_b64 s[30:31] 7332; 7333; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7334; GFX90A: ; %bb.0: 7335; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7336; GFX90A-NEXT: v_mov_b32_e32 v2, v0 7337; GFX90A-NEXT: v_mov_b32_e32 v0, s20 7338; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7339; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 7340; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7341; GFX90A-NEXT: v_mov_b32_e32 v3, s6 7342; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start 7343; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7344; GFX90A-NEXT: s_waitcnt vmcnt(0) 7345; GFX90A-NEXT: v_mov_b32_e32 v5, v0 7346; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 7347; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 7348; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7349; GFX90A-NEXT: s_waitcnt vmcnt(0) 7350; GFX90A-NEXT: buffer_wbinvl1 7351; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 7352; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7353; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7354; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 7355; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7356; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7357; GFX90A-NEXT: s_setpc_b64 s[30:31] 7358; 7359; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7360; GFX908: ; %bb.0: 7361; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7362; GFX908-NEXT: v_mov_b32_e32 v2, v0 7363; GFX908-NEXT: v_mov_b32_e32 v0, s20 7364; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7365; GFX908-NEXT: s_add_i32 s6, s20, 0x400 7366; GFX908-NEXT: s_mov_b64 s[4:5], 0 7367; GFX908-NEXT: v_mov_b32_e32 v3, s6 7368; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start 7369; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7370; GFX908-NEXT: s_waitcnt vmcnt(0) 7371; GFX908-NEXT: v_mov_b32_e32 v5, v0 7372; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 7373; GFX908-NEXT: v_mov_b32_e32 v0, v4 7374; GFX908-NEXT: v_mov_b32_e32 v1, v5 7375; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7376; GFX908-NEXT: s_waitcnt vmcnt(0) 7377; GFX908-NEXT: buffer_wbinvl1 7378; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 7379; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7380; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7381; GFX908-NEXT: s_cbranch_execnz .LBB22_1 7382; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7383; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7384; GFX908-NEXT: s_setpc_b64 s[30:31] 7385; 7386; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7387; GFX8: ; %bb.0: 7388; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7389; GFX8-NEXT: v_mov_b32_e32 v2, v0 7390; GFX8-NEXT: v_mov_b32_e32 v0, s20 7391; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7392; GFX8-NEXT: s_add_i32 s6, s20, 0x400 7393; GFX8-NEXT: s_mov_b64 s[4:5], 0 7394; GFX8-NEXT: v_mov_b32_e32 v3, s6 7395; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start 7396; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7397; GFX8-NEXT: s_waitcnt vmcnt(0) 7398; GFX8-NEXT: v_mov_b32_e32 v5, v0 7399; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 7400; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 7401; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 7402; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 7403; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 7404; 
GFX8-NEXT: v_mov_b32_e32 v0, v4 7405; GFX8-NEXT: v_mov_b32_e32 v1, v5 7406; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7407; GFX8-NEXT: s_waitcnt vmcnt(0) 7408; GFX8-NEXT: buffer_wbinvl1 7409; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 7410; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7411; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7412; GFX8-NEXT: s_cbranch_execnz .LBB22_1 7413; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7414; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7415; GFX8-NEXT: s_setpc_b64 s[30:31] 7416; 7417; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7418; GFX7: ; %bb.0: 7419; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7420; GFX7-NEXT: v_mov_b32_e32 v2, s20 7421; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 7422; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 7423; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 7424; GFX7-NEXT: s_add_i32 s6, s20, 0x400 7425; GFX7-NEXT: s_mov_b64 s[4:5], 0 7426; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 7427; GFX7-NEXT: s_waitcnt vmcnt(0) 7428; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 7429; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 7430; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 7431; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 7432; GFX7-NEXT: v_mov_b32_e32 v4, s6 7433; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start 7434; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7435; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 7436; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 7437; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 7438; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 7439; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 7440; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 7441; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 7442; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 7443; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 7444; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 7445; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 7446; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 7447; GFX7-NEXT: v_mov_b32_e32 v8, v6 7448; GFX7-NEXT: v_mov_b32_e32 v7, v5 7449; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 
7450; GFX7-NEXT: s_waitcnt vmcnt(0) 7451; GFX7-NEXT: buffer_wbinvl1 7452; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 7453; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 7454; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 7455; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 7456; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7457; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7458; GFX7-NEXT: s_cbranch_execnz .LBB22_1 7459; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7460; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7461; GFX7-NEXT: s_setpc_b64 s[30:31] 7462; 7463; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: 7464; GFX6: ; %bb.0: 7465; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7466; GFX6-NEXT: v_mov_b32_e32 v2, s20 7467; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 7468; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 7469; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 7470; GFX6-NEXT: s_add_i32 s6, s20, 0x400 7471; GFX6-NEXT: s_mov_b64 s[4:5], 0 7472; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 7473; GFX6-NEXT: s_waitcnt vmcnt(0) 7474; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 7475; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 7476; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 7477; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 7478; GFX6-NEXT: v_mov_b32_e32 v4, s6 7479; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start 7480; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 7481; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 7482; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 7483; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 7484; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 7485; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 7486; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 7487; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 7488; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 7489; GFX6-NEXT: s_waitcnt expcnt(0) 7490; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 7491; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 7492; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 7493; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 7494; GFX6-NEXT: v_mov_b32_e32 v8, v6 7495; GFX6-NEXT: v_mov_b32_e32 v7, v5 7496; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, 
s[16:19], 0 offen glc 7497; GFX6-NEXT: s_waitcnt vmcnt(0) 7498; GFX6-NEXT: buffer_wbinvl1 7499; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 7500; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 7501; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 7502; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 7503; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7504; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 7505; GFX6-NEXT: s_cbranch_execnz .LBB22_1 7506; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 7507; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 7508; GFX6-NEXT: s_waitcnt expcnt(0) 7509; GFX6-NEXT: s_setpc_b64 s[30:31] 7510 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 7511 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst 7512 ret <2 x half> %result 7513} 7514; No-return form (the atomicrmw result is unused) of the agent-scope seq_cst <2 x half> fadd with no !amdgpu.* metadata and no attribute group; in the checks below GFX12 and GFX940 emit buffer_atomic_pk_add_f16 without th:TH_ATOMIC_RETURN / sc0, and the other targets use a buffer_atomic_cmpswap loop. 7515define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { 7516; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7517; GFX12: ; %bb.0: 7518; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7519; GFX12-NEXT: s_wait_expcnt 0x0 7520; GFX12-NEXT: s_wait_samplecnt 0x0 7521; GFX12-NEXT: s_wait_bvhcnt 0x0 7522; GFX12-NEXT: s_wait_kmcnt 0x0 7523; GFX12-NEXT: v_mov_b32_e32 v1, s16 7524; GFX12-NEXT: s_wait_storecnt 0x0 7525; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 7526; GFX12-NEXT: s_wait_storecnt 0x0 7527; GFX12-NEXT: global_inv scope:SCOPE_DEV 7528; GFX12-NEXT: s_setpc_b64 s[30:31] 7529; 7530; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7531; GFX940: ; %bb.0: 7532; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7533; GFX940-NEXT: v_mov_b32_e32 v1, s16 7534; GFX940-NEXT: buffer_wbl2 sc1 7535; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 7536; GFX940-NEXT: s_waitcnt vmcnt(0) 7537; GFX940-NEXT: buffer_inv sc1 7538; GFX940-NEXT: s_setpc_b64 s[30:31] 7539; 7540; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7541; GFX11: ; %bb.0: 7542; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7543; GFX11-NEXT: v_mov_b32_e32 v1, s16 7544; GFX11-NEXT: s_add_i32 s4, s16, 0x400 7545; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7546; GFX11-NEXT: v_mov_b32_e32 v3, s4 7547; GFX11-NEXT: s_mov_b32 s4, 0 7548; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 7549; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start 7550; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7551; GFX11-NEXT: s_waitcnt vmcnt(0) 7552; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 7553; GFX11-NEXT: v_mov_b32_e32 v5, v2 7554; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7555; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 7556; GFX11-NEXT: v_mov_b32_e32 v4, v1 7557; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc 7558; GFX11-NEXT: s_waitcnt vmcnt(0) 7559; GFX11-NEXT: buffer_gl1_inv 7560; GFX11-NEXT: buffer_gl0_inv 7561; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 7562; GFX11-NEXT: v_mov_b32_e32 v2, v4 7563; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 7564; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7565; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 7566; GFX11-NEXT: s_cbranch_execnz .LBB23_1 7567; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7568; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 7569; GFX11-NEXT: s_setpc_b64 s[30:31] 7570; 7571; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7572; GFX10: ; %bb.0: 7573; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7574; GFX10-NEXT: v_mov_b32_e32 v1, s20 7575; GFX10-NEXT: s_add_i32 s4, s20, 0x400 7576; GFX10-NEXT: v_mov_b32_e32 v3, s4 7577; GFX10-NEXT: s_mov_b32 s4, 0 7578; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 7579; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start 7580; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7581; GFX10-NEXT: s_waitcnt vmcnt(0) 7582; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 7583; GFX10-NEXT: v_mov_b32_e32 v5, v2 7584; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7585; GFX10-NEXT: v_mov_b32_e32 v4, v1 7586; GFX10-NEXT: 
buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 7587; GFX10-NEXT: s_waitcnt vmcnt(0) 7588; GFX10-NEXT: buffer_gl1_inv 7589; GFX10-NEXT: buffer_gl0_inv 7590; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 7591; GFX10-NEXT: v_mov_b32_e32 v2, v4 7592; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7593; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7594; GFX10-NEXT: s_cbranch_execnz .LBB23_1 7595; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7596; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7597; GFX10-NEXT: s_setpc_b64 s[30:31] 7598; 7599; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7600; GFX90A: ; %bb.0: 7601; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7602; GFX90A-NEXT: v_mov_b32_e32 v1, s20 7603; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 7604; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 7605; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7606; GFX90A-NEXT: v_mov_b32_e32 v1, s6 7607; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start 7608; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7609; GFX90A-NEXT: s_waitcnt vmcnt(0) 7610; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 7611; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] 7612; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc 7613; GFX90A-NEXT: s_waitcnt vmcnt(0) 7614; GFX90A-NEXT: buffer_wbinvl1 7615; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 7616; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7617; GFX90A-NEXT: v_mov_b32_e32 v3, v4 7618; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7619; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 7620; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7621; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7622; GFX90A-NEXT: s_setpc_b64 s[30:31] 7623; 7624; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7625; GFX908: ; %bb.0: 7626; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7627; GFX908-NEXT: v_mov_b32_e32 v1, s20 7628; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 7629; GFX908-NEXT: s_add_i32 s6, s20, 
0x400 7630; GFX908-NEXT: s_mov_b64 s[4:5], 0 7631; GFX908-NEXT: v_mov_b32_e32 v3, s6 7632; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start 7633; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7634; GFX908-NEXT: s_waitcnt vmcnt(0) 7635; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 7636; GFX908-NEXT: v_mov_b32_e32 v5, v2 7637; GFX908-NEXT: v_mov_b32_e32 v4, v1 7638; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 7639; GFX908-NEXT: s_waitcnt vmcnt(0) 7640; GFX908-NEXT: buffer_wbinvl1 7641; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 7642; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7643; GFX908-NEXT: v_mov_b32_e32 v2, v4 7644; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7645; GFX908-NEXT: s_cbranch_execnz .LBB23_1 7646; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7647; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7648; GFX908-NEXT: s_setpc_b64 s[30:31] 7649; 7650; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7651; GFX8: ; %bb.0: 7652; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7653; GFX8-NEXT: v_mov_b32_e32 v1, s20 7654; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 7655; GFX8-NEXT: s_add_i32 s6, s20, 0x400 7656; GFX8-NEXT: s_mov_b64 s[4:5], 0 7657; GFX8-NEXT: v_mov_b32_e32 v3, s6 7658; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start 7659; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7660; GFX8-NEXT: s_waitcnt vmcnt(0) 7661; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 7662; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 7663; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 7664; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 7665; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 7666; GFX8-NEXT: v_mov_b32_e32 v5, v2 7667; GFX8-NEXT: v_mov_b32_e32 v4, v1 7668; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 7669; GFX8-NEXT: s_waitcnt vmcnt(0) 7670; GFX8-NEXT: buffer_wbinvl1 7671; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 7672; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7673; GFX8-NEXT: 
v_mov_b32_e32 v2, v4 7674; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7675; GFX8-NEXT: s_cbranch_execnz .LBB23_1 7676; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7677; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7678; GFX8-NEXT: s_setpc_b64 s[30:31] 7679; 7680; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7681; GFX7: ; %bb.0: 7682; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7683; GFX7-NEXT: v_mov_b32_e32 v2, s20 7684; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 7685; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 7686; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 7687; GFX7-NEXT: s_add_i32 s6, s20, 0x400 7688; GFX7-NEXT: s_mov_b64 s[4:5], 0 7689; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 7690; GFX7-NEXT: s_waitcnt vmcnt(0) 7691; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 7692; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 7693; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 7694; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 7695; GFX7-NEXT: v_mov_b32_e32 v2, s6 7696; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start 7697; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7698; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 7699; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 7700; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 7701; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 7702; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 7703; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 7704; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 7705; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 7706; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 7707; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 7708; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 7709; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 7710; GFX7-NEXT: v_mov_b32_e32 v7, v5 7711; GFX7-NEXT: v_mov_b32_e32 v6, v4 7712; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 7713; GFX7-NEXT: s_waitcnt vmcnt(0) 7714; GFX7-NEXT: buffer_wbinvl1 7715; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 7716; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 7717; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 7718; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 7719; GFX7-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] 7720; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7721; GFX7-NEXT: s_cbranch_execnz .LBB23_1 7722; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7723; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7724; GFX7-NEXT: s_setpc_b64 s[30:31] 7725; 7726; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: 7727; GFX6: ; %bb.0: 7728; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7729; GFX6-NEXT: v_mov_b32_e32 v2, s20 7730; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 7731; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 7732; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 7733; GFX6-NEXT: s_add_i32 s6, s20, 0x400 7734; GFX6-NEXT: s_mov_b64 s[4:5], 0 7735; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 7736; GFX6-NEXT: s_waitcnt vmcnt(0) 7737; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 7738; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 7739; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 7740; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 7741; GFX6-NEXT: v_mov_b32_e32 v2, s6 7742; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start 7743; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 7744; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 7745; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 7746; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 7747; GFX6-NEXT: s_waitcnt expcnt(0) 7748; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 7749; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 7750; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 7751; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 7752; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 7753; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 7754; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 7755; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 7756; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 7757; GFX6-NEXT: v_mov_b32_e32 v7, v5 7758; GFX6-NEXT: v_mov_b32_e32 v6, v4 7759; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 7760; GFX6-NEXT: s_waitcnt vmcnt(0) 7761; GFX6-NEXT: buffer_wbinvl1 7762; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 7763; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 7764; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 7765; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 7766; GFX6-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] 7767; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 7768; GFX6-NEXT: s_cbranch_execnz .LBB23_1 7769; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 7770; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 7771; GFX6-NEXT: s_waitcnt expcnt(0) 7772; GFX6-NEXT: s_setpc_b64 s[30:31] 7773 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 7774 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst 7775 ret void 7776} 7777; Returning agent-scope seq_cst <2 x half> fadd whose atomicrmw carries !amdgpu.no.remote.memory, in a function using attribute group #0; in the checks below GFX12 and GFX940 select buffer_atomic_pk_add_f16 and the other targets expand to a buffer_atomic_cmpswap loop. 7778define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { 7779; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7780; GFX12: ; %bb.0: 7781; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7782; GFX12-NEXT: s_wait_expcnt 0x0 7783; GFX12-NEXT: s_wait_samplecnt 0x0 7784; GFX12-NEXT: s_wait_bvhcnt 0x0 7785; GFX12-NEXT: s_wait_kmcnt 0x0 7786; GFX12-NEXT: v_mov_b32_e32 v1, s16 7787; GFX12-NEXT: s_wait_storecnt 0x0 7788; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 7789; GFX12-NEXT: s_wait_loadcnt 0x0 7790; GFX12-NEXT: global_inv scope:SCOPE_DEV 7791; GFX12-NEXT: s_setpc_b64 s[30:31] 7792; 7793; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7794; GFX940: ; %bb.0: 7795; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7796; GFX940-NEXT: v_mov_b32_e32 v1, s16 7797; GFX940-NEXT: buffer_wbl2 sc1 7798; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 7799; GFX940-NEXT: s_waitcnt vmcnt(0) 7800; GFX940-NEXT: buffer_inv sc1 7801; GFX940-NEXT: s_setpc_b64 s[30:31] 7802; 7803; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7804; GFX11: ; %bb.0: 7805; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7806; GFX11-NEXT: s_add_i32 s4, s16, 0x400 7807; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7808; GFX11-NEXT: v_dual_mov_b32 
v2, v0 :: v_dual_mov_b32 v3, s4 7809; GFX11-NEXT: v_mov_b32_e32 v0, s16 7810; GFX11-NEXT: s_mov_b32 s4, 0 7811; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 7812; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start 7813; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7814; GFX11-NEXT: s_waitcnt vmcnt(0) 7815; GFX11-NEXT: v_mov_b32_e32 v5, v0 7816; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7817; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7818; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 7819; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 7820; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 7821; GFX11-NEXT: s_waitcnt vmcnt(0) 7822; GFX11-NEXT: buffer_gl1_inv 7823; GFX11-NEXT: buffer_gl0_inv 7824; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 7825; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 7826; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7827; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 7828; GFX11-NEXT: s_cbranch_execnz .LBB24_1 7829; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7830; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 7831; GFX11-NEXT: s_setpc_b64 s[30:31] 7832; 7833; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7834; GFX10: ; %bb.0: 7835; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7836; GFX10-NEXT: v_mov_b32_e32 v2, v0 7837; GFX10-NEXT: v_mov_b32_e32 v0, s20 7838; GFX10-NEXT: s_add_i32 s4, s20, 0x400 7839; GFX10-NEXT: v_mov_b32_e32 v3, s4 7840; GFX10-NEXT: s_mov_b32 s4, 0 7841; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7842; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start 7843; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7844; GFX10-NEXT: s_waitcnt vmcnt(0) 7845; GFX10-NEXT: v_mov_b32_e32 v5, v0 7846; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7847; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 7848; GFX10-NEXT: v_mov_b32_e32 v0, v4 7849; GFX10-NEXT: v_mov_b32_e32 v1, v5 7850; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 
0 offen glc 7851; GFX10-NEXT: s_waitcnt vmcnt(0) 7852; GFX10-NEXT: buffer_gl1_inv 7853; GFX10-NEXT: buffer_gl0_inv 7854; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 7855; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7856; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7857; GFX10-NEXT: s_cbranch_execnz .LBB24_1 7858; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7859; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7860; GFX10-NEXT: s_setpc_b64 s[30:31] 7861; 7862; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7863; GFX90A: ; %bb.0: 7864; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7865; GFX90A-NEXT: v_mov_b32_e32 v2, v0 7866; GFX90A-NEXT: v_mov_b32_e32 v0, s20 7867; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7868; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 7869; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7870; GFX90A-NEXT: v_mov_b32_e32 v3, s6 7871; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start 7872; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7873; GFX90A-NEXT: s_waitcnt vmcnt(0) 7874; GFX90A-NEXT: v_mov_b32_e32 v5, v0 7875; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 7876; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 7877; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7878; GFX90A-NEXT: s_waitcnt vmcnt(0) 7879; GFX90A-NEXT: buffer_wbinvl1 7880; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 7881; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7882; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7883; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 7884; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7885; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7886; GFX90A-NEXT: s_setpc_b64 s[30:31] 7887; 7888; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7889; GFX908: ; %bb.0: 7890; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7891; GFX908-NEXT: v_mov_b32_e32 v2, v0 7892; GFX908-NEXT: v_mov_b32_e32 v0, s20 7893; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 
7894; GFX908-NEXT: s_add_i32 s6, s20, 0x400 7895; GFX908-NEXT: s_mov_b64 s[4:5], 0 7896; GFX908-NEXT: v_mov_b32_e32 v3, s6 7897; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start 7898; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7899; GFX908-NEXT: s_waitcnt vmcnt(0) 7900; GFX908-NEXT: v_mov_b32_e32 v5, v0 7901; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 7902; GFX908-NEXT: v_mov_b32_e32 v0, v4 7903; GFX908-NEXT: v_mov_b32_e32 v1, v5 7904; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7905; GFX908-NEXT: s_waitcnt vmcnt(0) 7906; GFX908-NEXT: buffer_wbinvl1 7907; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 7908; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7909; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7910; GFX908-NEXT: s_cbranch_execnz .LBB24_1 7911; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7912; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7913; GFX908-NEXT: s_setpc_b64 s[30:31] 7914; 7915; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7916; GFX8: ; %bb.0: 7917; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7918; GFX8-NEXT: v_mov_b32_e32 v2, v0 7919; GFX8-NEXT: v_mov_b32_e32 v0, s20 7920; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 7921; GFX8-NEXT: s_add_i32 s6, s20, 0x400 7922; GFX8-NEXT: s_mov_b64 s[4:5], 0 7923; GFX8-NEXT: v_mov_b32_e32 v3, s6 7924; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start 7925; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7926; GFX8-NEXT: s_waitcnt vmcnt(0) 7927; GFX8-NEXT: v_mov_b32_e32 v5, v0 7928; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 7929; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 7930; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 7931; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 7932; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 7933; GFX8-NEXT: v_mov_b32_e32 v0, v4 7934; GFX8-NEXT: v_mov_b32_e32 v1, v5 7935; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 7936; GFX8-NEXT: s_waitcnt vmcnt(0) 7937; 
GFX8-NEXT: buffer_wbinvl1 7938; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 7939; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7940; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7941; GFX8-NEXT: s_cbranch_execnz .LBB24_1 7942; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7943; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7944; GFX8-NEXT: s_setpc_b64 s[30:31] 7945; 7946; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7947; GFX7: ; %bb.0: 7948; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7949; GFX7-NEXT: v_mov_b32_e32 v2, s20 7950; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 7951; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 7952; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 7953; GFX7-NEXT: s_add_i32 s6, s20, 0x400 7954; GFX7-NEXT: s_mov_b64 s[4:5], 0 7955; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 7956; GFX7-NEXT: s_waitcnt vmcnt(0) 7957; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 7958; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 7959; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 7960; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 7961; GFX7-NEXT: v_mov_b32_e32 v4, s6 7962; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start 7963; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7964; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 7965; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 7966; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 7967; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 7968; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 7969; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 7970; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 7971; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 7972; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 7973; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 7974; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 7975; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 7976; GFX7-NEXT: v_mov_b32_e32 v8, v6 7977; GFX7-NEXT: v_mov_b32_e32 v7, v5 7978; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 7979; GFX7-NEXT: s_waitcnt vmcnt(0) 7980; GFX7-NEXT: buffer_wbinvl1 7981; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 7982; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 7983; 
GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 7984; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 7985; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7986; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7987; GFX7-NEXT: s_cbranch_execnz .LBB24_1 7988; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7989; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7990; GFX7-NEXT: s_setpc_b64 s[30:31] 7991; 7992; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: 7993; GFX6: ; %bb.0: 7994; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7995; GFX6-NEXT: v_mov_b32_e32 v2, s20 7996; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 7997; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 7998; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 7999; GFX6-NEXT: s_add_i32 s6, s20, 0x400 8000; GFX6-NEXT: s_mov_b64 s[4:5], 0 8001; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 8002; GFX6-NEXT: s_waitcnt vmcnt(0) 8003; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 8004; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 8005; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 8006; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 8007; GFX6-NEXT: v_mov_b32_e32 v4, s6 8008; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start 8009; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 8010; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 8011; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 8012; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 8013; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 8014; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 8015; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 8016; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 8017; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 8018; GFX6-NEXT: s_waitcnt expcnt(0) 8019; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 8020; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 8021; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 8022; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 8023; GFX6-NEXT: v_mov_b32_e32 v8, v6 8024; GFX6-NEXT: v_mov_b32_e32 v7, v5 8025; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 8026; GFX6-NEXT: s_waitcnt vmcnt(0) 8027; GFX6-NEXT: buffer_wbinvl1 8028; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 
8029; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 8030; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 8031; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 8032; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8033; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 8034; GFX6-NEXT: s_cbranch_execnz .LBB24_1 8035; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 8036; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 8037; GFX6-NEXT: s_waitcnt expcnt(0) 8038; GFX6-NEXT: s_setpc_b64 s[30:31] 8039 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 8040 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 8041 ret <2 x half> %result 8042} 8043 8044define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { 8045; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8046; GFX12: ; %bb.0: 8047; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8048; GFX12-NEXT: s_wait_expcnt 0x0 8049; GFX12-NEXT: s_wait_samplecnt 0x0 8050; GFX12-NEXT: s_wait_bvhcnt 0x0 8051; GFX12-NEXT: s_wait_kmcnt 0x0 8052; GFX12-NEXT: v_mov_b32_e32 v1, s16 8053; GFX12-NEXT: s_wait_storecnt 0x0 8054; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 8055; GFX12-NEXT: s_wait_storecnt 0x0 8056; GFX12-NEXT: global_inv scope:SCOPE_DEV 8057; GFX12-NEXT: s_setpc_b64 s[30:31] 8058; 8059; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8060; GFX940: ; %bb.0: 8061; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8062; GFX940-NEXT: v_mov_b32_e32 v1, s16 8063; GFX940-NEXT: buffer_wbl2 sc1 8064; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 8065; GFX940-NEXT: s_waitcnt vmcnt(0) 8066; GFX940-NEXT: buffer_inv sc1 8067; GFX940-NEXT: s_setpc_b64 s[30:31] 8068; 8069; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8070; GFX11: ; %bb.0: 8071; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8072; GFX11-NEXT: v_mov_b32_e32 v1, s16 8073; GFX11-NEXT: s_add_i32 s4, s16, 0x400 8074; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8075; GFX11-NEXT: v_mov_b32_e32 v3, s4 8076; GFX11-NEXT: s_mov_b32 s4, 0 8077; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 8078; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start 8079; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8080; GFX11-NEXT: s_waitcnt vmcnt(0) 8081; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 8082; GFX11-NEXT: v_mov_b32_e32 v5, v2 8083; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8084; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 8085; GFX11-NEXT: v_mov_b32_e32 v4, v1 8086; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc 8087; GFX11-NEXT: s_waitcnt vmcnt(0) 8088; GFX11-NEXT: buffer_gl1_inv 8089; GFX11-NEXT: buffer_gl0_inv 8090; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 8091; GFX11-NEXT: v_mov_b32_e32 v2, v4 8092; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 8093; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8094; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 8095; GFX11-NEXT: s_cbranch_execnz .LBB25_1 8096; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8097; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 8098; GFX11-NEXT: s_setpc_b64 s[30:31] 8099; 8100; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8101; GFX10: ; %bb.0: 8102; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8103; GFX10-NEXT: v_mov_b32_e32 v1, s20 8104; GFX10-NEXT: s_add_i32 s4, s20, 0x400 8105; GFX10-NEXT: v_mov_b32_e32 v3, s4 8106; GFX10-NEXT: s_mov_b32 s4, 0 8107; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 8108; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start 8109; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8110; GFX10-NEXT: s_waitcnt vmcnt(0) 8111; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 8112; GFX10-NEXT: v_mov_b32_e32 v5, v2 8113; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8114; GFX10-NEXT: v_mov_b32_e32 v4, v1 8115; GFX10-NEXT: 
buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 8116; GFX10-NEXT: s_waitcnt vmcnt(0) 8117; GFX10-NEXT: buffer_gl1_inv 8118; GFX10-NEXT: buffer_gl0_inv 8119; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 8120; GFX10-NEXT: v_mov_b32_e32 v2, v4 8121; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 8122; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 8123; GFX10-NEXT: s_cbranch_execnz .LBB25_1 8124; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8125; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 8126; GFX10-NEXT: s_setpc_b64 s[30:31] 8127; 8128; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8129; GFX90A: ; %bb.0: 8130; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8131; GFX90A-NEXT: v_mov_b32_e32 v1, s20 8132; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 8133; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 8134; GFX90A-NEXT: s_mov_b64 s[4:5], 0 8135; GFX90A-NEXT: v_mov_b32_e32 v1, s6 8136; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start 8137; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8138; GFX90A-NEXT: s_waitcnt vmcnt(0) 8139; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 8140; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] 8141; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc 8142; GFX90A-NEXT: s_waitcnt vmcnt(0) 8143; GFX90A-NEXT: buffer_wbinvl1 8144; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 8145; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8146; GFX90A-NEXT: v_mov_b32_e32 v3, v4 8147; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 8148; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 8149; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8150; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 8151; GFX90A-NEXT: s_setpc_b64 s[30:31] 8152; 8153; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8154; GFX908: ; %bb.0: 8155; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8156; GFX908-NEXT: v_mov_b32_e32 v1, s20 8157; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen 
offset:1024 8158; GFX908-NEXT: s_add_i32 s6, s20, 0x400 8159; GFX908-NEXT: s_mov_b64 s[4:5], 0 8160; GFX908-NEXT: v_mov_b32_e32 v3, s6 8161; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start 8162; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8163; GFX908-NEXT: s_waitcnt vmcnt(0) 8164; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 8165; GFX908-NEXT: v_mov_b32_e32 v5, v2 8166; GFX908-NEXT: v_mov_b32_e32 v4, v1 8167; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 8168; GFX908-NEXT: s_waitcnt vmcnt(0) 8169; GFX908-NEXT: buffer_wbinvl1 8170; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 8171; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8172; GFX908-NEXT: v_mov_b32_e32 v2, v4 8173; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 8174; GFX908-NEXT: s_cbranch_execnz .LBB25_1 8175; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8176; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 8177; GFX908-NEXT: s_setpc_b64 s[30:31] 8178; 8179; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8180; GFX8: ; %bb.0: 8181; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8182; GFX8-NEXT: v_mov_b32_e32 v1, s20 8183; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 8184; GFX8-NEXT: s_add_i32 s6, s20, 0x400 8185; GFX8-NEXT: s_mov_b64 s[4:5], 0 8186; GFX8-NEXT: v_mov_b32_e32 v3, s6 8187; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start 8188; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8189; GFX8-NEXT: s_waitcnt vmcnt(0) 8190; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 8191; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 8192; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 8193; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 8194; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 8195; GFX8-NEXT: v_mov_b32_e32 v5, v2 8196; GFX8-NEXT: v_mov_b32_e32 v4, v1 8197; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 8198; GFX8-NEXT: s_waitcnt vmcnt(0) 8199; GFX8-NEXT: buffer_wbinvl1 8200; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, 
v2 8201; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8202; GFX8-NEXT: v_mov_b32_e32 v2, v4 8203; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8204; GFX8-NEXT: s_cbranch_execnz .LBB25_1 8205; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8206; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8207; GFX8-NEXT: s_setpc_b64 s[30:31] 8208; 8209; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8210; GFX7: ; %bb.0: 8211; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8212; GFX7-NEXT: v_mov_b32_e32 v2, s20 8213; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 8214; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 8215; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 8216; GFX7-NEXT: s_add_i32 s6, s20, 0x400 8217; GFX7-NEXT: s_mov_b64 s[4:5], 0 8218; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 8219; GFX7-NEXT: s_waitcnt vmcnt(0) 8220; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 8221; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 8222; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 8223; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 8224; GFX7-NEXT: v_mov_b32_e32 v2, s6 8225; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start 8226; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8227; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 8228; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 8229; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 8230; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 8231; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 8232; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 8233; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 8234; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 8235; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 8236; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 8237; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 8238; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 8239; GFX7-NEXT: v_mov_b32_e32 v7, v5 8240; GFX7-NEXT: v_mov_b32_e32 v6, v4 8241; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 8242; GFX7-NEXT: s_waitcnt vmcnt(0) 8243; GFX7-NEXT: buffer_wbinvl1 8244; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 8245; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 8246; GFX7-NEXT: v_cvt_f32_f16_e32 
v4, v4 8247; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 8248; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8249; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8250; GFX7-NEXT: s_cbranch_execnz .LBB25_1 8251; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8252; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8253; GFX7-NEXT: s_setpc_b64 s[30:31] 8254; 8255; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: 8256; GFX6: ; %bb.0: 8257; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8258; GFX6-NEXT: v_mov_b32_e32 v2, s20 8259; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 8260; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 8261; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 8262; GFX6-NEXT: s_add_i32 s6, s20, 0x400 8263; GFX6-NEXT: s_mov_b64 s[4:5], 0 8264; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 8265; GFX6-NEXT: s_waitcnt vmcnt(0) 8266; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 8267; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 8268; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 8269; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 8270; GFX6-NEXT: v_mov_b32_e32 v2, s6 8271; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start 8272; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 8273; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 8274; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 8275; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 8276; GFX6-NEXT: s_waitcnt expcnt(0) 8277; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 8278; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 8279; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 8280; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 8281; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 8282; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 8283; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 8284; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 8285; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 8286; GFX6-NEXT: v_mov_b32_e32 v7, v5 8287; GFX6-NEXT: v_mov_b32_e32 v6, v4 8288; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 8289; GFX6-NEXT: s_waitcnt vmcnt(0) 8290; GFX6-NEXT: buffer_wbinvl1 8291; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 8292; GFX6-NEXT: 
v_cvt_f32_f16_e32 v3, v6 8293; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 8294; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 8295; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8296; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 8297; GFX6-NEXT: s_cbranch_execnz .LBB25_1 8298; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 8299; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 8300; GFX6-NEXT: s_waitcnt expcnt(0) 8301; GFX6-NEXT: s_setpc_b64 s[30:31] 8302 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 8303 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 8304 ret void 8305} 8306 8307; -------------------------------------------------------------------- 8308; <2 x bfloat> 8309; -------------------------------------------------------------------- 8310 8311define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { 8312; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8313; GFX12: ; %bb.0: 8314; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8315; GFX12-NEXT: s_wait_expcnt 0x0 8316; GFX12-NEXT: s_wait_samplecnt 0x0 8317; GFX12-NEXT: s_wait_bvhcnt 0x0 8318; GFX12-NEXT: s_wait_kmcnt 0x0 8319; GFX12-NEXT: v_mov_b32_e32 v1, s16 8320; GFX12-NEXT: s_wait_storecnt 0x0 8321; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 8322; GFX12-NEXT: s_wait_loadcnt 0x0 8323; GFX12-NEXT: global_inv scope:SCOPE_DEV 8324; GFX12-NEXT: s_setpc_b64 s[30:31] 8325; 8326; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8327; GFX940: ; %bb.0: 8328; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8329; GFX940-NEXT: v_mov_b32_e32 v1, v0 8330; GFX940-NEXT: v_mov_b32_e32 v0, s16 8331; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 8332; GFX940-NEXT: s_add_i32 s4, s16, 0x400 8333; GFX940-NEXT: 
s_mov_b64 s[6:7], 0 8334; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 8335; GFX940-NEXT: s_movk_i32 s8, 0x7fff 8336; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8337; GFX940-NEXT: s_mov_b32 s9, 0x7060302 8338; GFX940-NEXT: v_mov_b32_e32 v4, s4 8339; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start 8340; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8341; GFX940-NEXT: s_waitcnt vmcnt(0) 8342; GFX940-NEXT: v_mov_b32_e32 v7, v0 8343; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 8344; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 8345; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 8346; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 8347; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 8348; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 8349; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 8350; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 8351; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 8352; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 8353; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 8354; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8355; GFX940-NEXT: buffer_wbl2 sc1 8356; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 8357; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 8358; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 8359; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] 8360; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 8361; GFX940-NEXT: s_waitcnt vmcnt(0) 8362; GFX940-NEXT: buffer_inv sc1 8363; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 8364; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8365; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 8366; GFX940-NEXT: s_cbranch_execnz .LBB26_1 8367; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8368; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 8369; GFX940-NEXT: s_setpc_b64 s[30:31] 8370; 8371; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8372; GFX11: ; %bb.0: 8373; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8374; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 8375; GFX11-NEXT: s_add_i32 s4, s16, 0x400 8376; 
GFX11-NEXT: s_mov_b32 s5, 0 8377; GFX11-NEXT: v_mov_b32_e32 v4, s4 8378; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 8379; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 8380; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 8381; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8382; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 8383; GFX11-NEXT: .p2align 6 8384; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start 8385; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8386; GFX11-NEXT: s_waitcnt vmcnt(0) 8387; GFX11-NEXT: v_mov_b32_e32 v6, v0 8388; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8389; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8390; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 8391; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 8392; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 8393; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 8394; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 8395; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 8396; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 8397; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8398; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 8399; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 8400; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 8401; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 8402; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 8403; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 8404; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 8405; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8406; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 8407; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 8408; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8409; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 8410; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc 8411; GFX11-NEXT: s_waitcnt vmcnt(0) 8412; GFX11-NEXT: buffer_gl1_inv 8413; 
GFX11-NEXT: buffer_gl0_inv 8414; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 8415; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 8416; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8417; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 8418; GFX11-NEXT: s_cbranch_execnz .LBB26_1 8419; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8420; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 8421; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 8422; GFX11-NEXT: s_setpc_b64 s[30:31] 8423; 8424; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8425; GFX10: ; %bb.0: 8426; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8427; GFX10-NEXT: v_mov_b32_e32 v1, v0 8428; GFX10-NEXT: v_mov_b32_e32 v0, s20 8429; GFX10-NEXT: s_add_i32 s4, s20, 0x400 8430; GFX10-NEXT: s_mov_b32 s5, 0 8431; GFX10-NEXT: v_mov_b32_e32 v4, s4 8432; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 8433; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 8434; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8435; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start 8436; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8437; GFX10-NEXT: s_waitcnt vmcnt(0) 8438; GFX10-NEXT: v_mov_b32_e32 v6, v0 8439; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8440; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 8441; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 8442; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 8443; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 8444; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 8445; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 8446; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 8447; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 8448; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 8449; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 8450; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 8451; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 8452; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo 8453; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 8454; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 8455; GFX10-NEXT: v_mov_b32_e32 v0, v5 8456; GFX10-NEXT: 
v_mov_b32_e32 v1, v6 8457; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 8458; GFX10-NEXT: s_waitcnt vmcnt(0) 8459; GFX10-NEXT: buffer_gl1_inv 8460; GFX10-NEXT: buffer_gl0_inv 8461; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 8462; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 8463; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 8464; GFX10-NEXT: s_cbranch_execnz .LBB26_1 8465; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8466; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 8467; GFX10-NEXT: s_setpc_b64 s[30:31] 8468; 8469; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8470; GFX90A: ; %bb.0: 8471; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8472; GFX90A-NEXT: v_mov_b32_e32 v1, v0 8473; GFX90A-NEXT: v_mov_b32_e32 v0, s20 8474; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 8475; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 8476; GFX90A-NEXT: s_mov_b64 s[6:7], 0 8477; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 8478; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 8479; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8480; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 8481; GFX90A-NEXT: v_mov_b32_e32 v4, s4 8482; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start 8483; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8484; GFX90A-NEXT: s_waitcnt vmcnt(0) 8485; GFX90A-NEXT: v_mov_b32_e32 v7, v0 8486; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 8487; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 8488; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 8489; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 8490; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 8491; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 8492; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 8493; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 8494; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 8495; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 8496; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 8497; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8498; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 8499; GFX90A-NEXT: 
v_cndmask_b32_e32 v1, v8, v9, vcc 8500; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 8501; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] 8502; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 8503; GFX90A-NEXT: s_waitcnt vmcnt(0) 8504; GFX90A-NEXT: buffer_wbinvl1 8505; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 8506; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8507; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 8508; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 8509; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8510; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 8511; GFX90A-NEXT: s_setpc_b64 s[30:31] 8512; 8513; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8514; GFX908: ; %bb.0: 8515; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8516; GFX908-NEXT: v_mov_b32_e32 v1, v0 8517; GFX908-NEXT: v_mov_b32_e32 v0, s20 8518; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 8519; GFX908-NEXT: s_add_i32 s4, s20, 0x400 8520; GFX908-NEXT: s_mov_b64 s[6:7], 0 8521; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 8522; GFX908-NEXT: s_movk_i32 s8, 0x7fff 8523; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8524; GFX908-NEXT: s_mov_b32 s9, 0x7060302 8525; GFX908-NEXT: v_mov_b32_e32 v4, s4 8526; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start 8527; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8528; GFX908-NEXT: s_waitcnt vmcnt(0) 8529; GFX908-NEXT: v_mov_b32_e32 v6, v0 8530; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 8531; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 8532; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 8533; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 8534; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 8535; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 8536; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 8537; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 8538; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 8539; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 8540; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 8541; GFX908-NEXT: v_cmp_u_f32_e64 
s[4:5], v0, v0 8542; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 8543; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 8544; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 8545; GFX908-NEXT: v_mov_b32_e32 v0, v5 8546; GFX908-NEXT: v_mov_b32_e32 v1, v6 8547; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 8548; GFX908-NEXT: s_waitcnt vmcnt(0) 8549; GFX908-NEXT: buffer_wbinvl1 8550; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 8551; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8552; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 8553; GFX908-NEXT: s_cbranch_execnz .LBB26_1 8554; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8555; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 8556; GFX908-NEXT: s_setpc_b64 s[30:31] 8557; 8558; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8559; GFX8: ; %bb.0: 8560; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8561; GFX8-NEXT: v_mov_b32_e32 v1, v0 8562; GFX8-NEXT: v_mov_b32_e32 v0, s20 8563; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 8564; GFX8-NEXT: s_add_i32 s4, s20, 0x400 8565; GFX8-NEXT: s_mov_b64 s[6:7], 0 8566; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 8567; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8568; GFX8-NEXT: v_mov_b32_e32 v4, s4 8569; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start 8570; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8571; GFX8-NEXT: s_waitcnt vmcnt(0) 8572; GFX8-NEXT: v_mov_b32_e32 v6, v0 8573; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 8574; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 8575; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 8576; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 8577; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 8578; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 8579; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 8580; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 8581; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 8582; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 8583; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 8584; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 8585; 
GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 8586; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8587; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 8588; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 8589; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 8590; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 8591; GFX8-NEXT: v_mov_b32_e32 v0, v5 8592; GFX8-NEXT: v_mov_b32_e32 v1, v6 8593; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 8594; GFX8-NEXT: s_waitcnt vmcnt(0) 8595; GFX8-NEXT: buffer_wbinvl1 8596; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 8597; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8598; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 8599; GFX8-NEXT: s_cbranch_execnz .LBB26_1 8600; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8601; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 8602; GFX8-NEXT: s_setpc_b64 s[30:31] 8603; 8604; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8605; GFX7: ; %bb.0: 8606; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8607; GFX7-NEXT: v_mov_b32_e32 v2, s20 8608; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 8609; GFX7-NEXT: s_add_i32 s6, s20, 0x400 8610; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 8611; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 8612; GFX7-NEXT: s_mov_b64 s[4:5], 0 8613; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 8614; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8615; GFX7-NEXT: s_waitcnt vmcnt(0) 8616; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 8617; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 8618; GFX7-NEXT: v_mov_b32_e32 v4, s6 8619; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start 8620; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8621; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 8622; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 8623; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 8624; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 8625; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 8626; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 8627; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 8628; GFX7-NEXT: v_alignbit_b32 v1, 
v1, v0, 16 8629; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 8630; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 8631; GFX7-NEXT: v_mov_b32_e32 v6, v1 8632; GFX7-NEXT: v_mov_b32_e32 v5, v0 8633; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 8634; GFX7-NEXT: s_waitcnt vmcnt(0) 8635; GFX7-NEXT: buffer_wbinvl1 8636; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 8637; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 8638; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8639; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 8640; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8641; GFX7-NEXT: s_cbranch_execnz .LBB26_1 8642; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8643; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8644; GFX7-NEXT: s_setpc_b64 s[30:31] 8645; 8646; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8647; GFX6: ; %bb.0: 8648; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8649; GFX6-NEXT: v_mov_b32_e32 v2, s20 8650; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 8651; GFX6-NEXT: s_add_i32 s6, s20, 0x400 8652; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 8653; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 8654; GFX6-NEXT: s_mov_b64 s[4:5], 0 8655; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 8656; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 8657; GFX6-NEXT: s_waitcnt vmcnt(0) 8658; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 8659; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 8660; GFX6-NEXT: v_mov_b32_e32 v4, s6 8661; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start 8662; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 8663; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 8664; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 8665; GFX6-NEXT: s_waitcnt expcnt(0) 8666; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 8667; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 8668; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 8669; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 8670; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 8671; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 8672; GFX6-NEXT: v_lshrrev_b32_e32 v0, 
16, v6 8673; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 8674; GFX6-NEXT: v_mov_b32_e32 v6, v1 8675; GFX6-NEXT: v_mov_b32_e32 v5, v0 8676; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 8677; GFX6-NEXT: s_waitcnt vmcnt(0) 8678; GFX6-NEXT: buffer_wbinvl1 8679; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 8680; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 8681; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8682; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 8683; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 8684; GFX6-NEXT: s_cbranch_execnz .LBB26_1 8685; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 8686; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 8687; GFX6-NEXT: s_waitcnt expcnt(0) 8688; GFX6-NEXT: s_setpc_b64 s[30:31] 8689 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 8690 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 8691 ret <2 x bfloat> %result 8692} 8693 8694define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { 8695; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8696; GFX12: ; %bb.0: 8697; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8698; GFX12-NEXT: s_wait_expcnt 0x0 8699; GFX12-NEXT: s_wait_samplecnt 0x0 8700; GFX12-NEXT: s_wait_bvhcnt 0x0 8701; GFX12-NEXT: s_wait_kmcnt 0x0 8702; GFX12-NEXT: v_mov_b32_e32 v1, s16 8703; GFX12-NEXT: s_wait_storecnt 0x0 8704; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 8705; GFX12-NEXT: s_wait_storecnt 0x0 8706; GFX12-NEXT: global_inv scope:SCOPE_DEV 8707; GFX12-NEXT: s_setpc_b64 s[30:31] 8708; 8709; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8710; GFX940: ; %bb.0: 8711; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8712; GFX940-NEXT: v_mov_b32_e32 v1, s16 8713; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 
0 offen offset:1024 8714; GFX940-NEXT: s_add_i32 s4, s16, 0x400 8715; GFX940-NEXT: s_mov_b64 s[6:7], 0 8716; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 8717; GFX940-NEXT: s_movk_i32 s8, 0x7fff 8718; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 8719; GFX940-NEXT: s_mov_b32 s9, 0x7060302 8720; GFX940-NEXT: v_mov_b32_e32 v4, s4 8721; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start 8722; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8723; GFX940-NEXT: s_waitcnt vmcnt(0) 8724; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 8725; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 8726; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 8727; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 8728; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 8729; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 8730; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 8731; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 8732; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 8733; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 8734; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 8735; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8736; GFX940-NEXT: buffer_wbl2 sc1 8737; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 8738; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 8739; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 8740; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] 8741; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 8742; GFX940-NEXT: s_waitcnt vmcnt(0) 8743; GFX940-NEXT: buffer_inv sc1 8744; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 8745; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8746; GFX940-NEXT: v_mov_b32_e32 v1, v6 8747; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 8748; GFX940-NEXT: s_cbranch_execnz .LBB27_1 8749; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8750; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 8751; GFX940-NEXT: s_setpc_b64 s[30:31] 8752; 8753; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8754; GFX11: ; %bb.0: 8755; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8756; GFX11-NEXT: v_dual_mov_b32 v1, 
s16 :: v_dual_lshlrev_b32 v2, 16, v0 8757; GFX11-NEXT: s_add_i32 s4, s16, 0x400 8758; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8759; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 8760; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 8761; GFX11-NEXT: s_mov_b32 s5, 0 8762; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 8763; GFX11-NEXT: .p2align 6 8764; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start 8765; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8766; GFX11-NEXT: s_waitcnt vmcnt(0) 8767; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 8768; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 8769; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8770; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8771; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 8772; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 8773; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 8774; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 8775; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 8776; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 8777; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 8778; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 8779; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 8780; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 8781; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 8782; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 8783; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 8784; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8785; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 8786; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 8787; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc 8788; GFX11-NEXT: s_waitcnt vmcnt(0) 8789; GFX11-NEXT: buffer_gl1_inv 8790; GFX11-NEXT: buffer_gl0_inv 8791; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 8792; GFX11-NEXT: v_mov_b32_e32 v1, v5 8793; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 8794; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) 8795; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 8796; GFX11-NEXT: s_cbranch_execnz .LBB27_1 8797; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8798; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 8799; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 8800; GFX11-NEXT: s_setpc_b64 s[30:31] 8801; 8802; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8803; GFX10: ; %bb.0: 8804; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8805; GFX10-NEXT: v_mov_b32_e32 v1, s20 8806; GFX10-NEXT: s_add_i32 s4, s20, 0x400 8807; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 8808; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 8809; GFX10-NEXT: v_mov_b32_e32 v4, s4 8810; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 8811; GFX10-NEXT: s_mov_b32 s5, 0 8812; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start 8813; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8814; GFX10-NEXT: s_waitcnt vmcnt(0) 8815; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 8816; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 8817; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8818; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 8819; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 8820; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 8821; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 8822; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 8823; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 8824; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 8825; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 8826; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 8827; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 8828; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 8829; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 8830; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 8831; GFX10-NEXT: v_mov_b32_e32 v6, v1 8832; GFX10-NEXT: v_mov_b32_e32 v5, v0 8833; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 8834; GFX10-NEXT: s_waitcnt vmcnt(0) 8835; GFX10-NEXT: buffer_gl1_inv 8836; GFX10-NEXT: buffer_gl0_inv 8837; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v5, v1 8838; GFX10-NEXT: v_mov_b32_e32 v1, v5 8839; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 8840; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 8841; GFX10-NEXT: s_cbranch_execnz .LBB27_1 8842; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8843; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 8844; GFX10-NEXT: s_setpc_b64 s[30:31] 8845; 8846; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8847; GFX90A: ; %bb.0: 8848; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8849; GFX90A-NEXT: v_mov_b32_e32 v1, s20 8850; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 8851; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 8852; GFX90A-NEXT: s_mov_b64 s[6:7], 0 8853; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 8854; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 8855; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 8856; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 8857; GFX90A-NEXT: v_mov_b32_e32 v4, s4 8858; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start 8859; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8860; GFX90A-NEXT: s_waitcnt vmcnt(0) 8861; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 8862; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 8863; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 8864; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 8865; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 8866; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 8867; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 8868; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 8869; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 8870; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 8871; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 8872; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8873; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 8874; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 8875; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 8876; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] 8877; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc 8878; GFX90A-NEXT: s_waitcnt vmcnt(0) 8879; 
GFX90A-NEXT: buffer_wbinvl1 8880; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 8881; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8882; GFX90A-NEXT: v_mov_b32_e32 v1, v6 8883; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 8884; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 8885; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8886; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 8887; GFX90A-NEXT: s_setpc_b64 s[30:31] 8888; 8889; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8890; GFX908: ; %bb.0: 8891; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8892; GFX908-NEXT: v_mov_b32_e32 v1, s20 8893; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 8894; GFX908-NEXT: s_add_i32 s4, s20, 0x400 8895; GFX908-NEXT: s_mov_b64 s[6:7], 0 8896; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 8897; GFX908-NEXT: s_movk_i32 s8, 0x7fff 8898; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 8899; GFX908-NEXT: s_mov_b32 s9, 0x7060302 8900; GFX908-NEXT: v_mov_b32_e32 v4, s4 8901; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start 8902; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8903; GFX908-NEXT: s_waitcnt vmcnt(0) 8904; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 8905; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 8906; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 8907; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 8908; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 8909; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 8910; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 8911; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 8912; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 8913; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 8914; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 8915; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8916; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 8917; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 8918; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 8919; GFX908-NEXT: v_mov_b32_e32 v6, v1 8920; GFX908-NEXT: v_mov_b32_e32 v5, v0 8921; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, 
s[16:19], 0 offen glc 8922; GFX908-NEXT: s_waitcnt vmcnt(0) 8923; GFX908-NEXT: buffer_wbinvl1 8924; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 8925; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8926; GFX908-NEXT: v_mov_b32_e32 v1, v5 8927; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 8928; GFX908-NEXT: s_cbranch_execnz .LBB27_1 8929; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8930; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 8931; GFX908-NEXT: s_setpc_b64 s[30:31] 8932; 8933; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8934; GFX8: ; %bb.0: 8935; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8936; GFX8-NEXT: v_mov_b32_e32 v1, s20 8937; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 8938; GFX8-NEXT: s_add_i32 s4, s20, 0x400 8939; GFX8-NEXT: s_mov_b64 s[6:7], 0 8940; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 8941; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 8942; GFX8-NEXT: v_mov_b32_e32 v4, s4 8943; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start 8944; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8945; GFX8-NEXT: s_waitcnt vmcnt(0) 8946; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 8947; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 8948; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 8949; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 8950; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 8951; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 8952; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 8953; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 8954; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 8955; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 8956; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 8957; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 8958; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 8959; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 8960; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 8961; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 8962; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 8963; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 8964; GFX8-NEXT: v_mov_b32_e32 v6, v1 8965; 
GFX8-NEXT: v_mov_b32_e32 v5, v0 8966; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 8967; GFX8-NEXT: s_waitcnt vmcnt(0) 8968; GFX8-NEXT: buffer_wbinvl1 8969; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 8970; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8971; GFX8-NEXT: v_mov_b32_e32 v1, v5 8972; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 8973; GFX8-NEXT: s_cbranch_execnz .LBB27_1 8974; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8975; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 8976; GFX8-NEXT: s_setpc_b64 s[30:31] 8977; 8978; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 8979; GFX7: ; %bb.0: 8980; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8981; GFX7-NEXT: v_mov_b32_e32 v2, s20 8982; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 8983; GFX7-NEXT: s_add_i32 s6, s20, 0x400 8984; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 8985; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 8986; GFX7-NEXT: s_mov_b64 s[4:5], 0 8987; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 8988; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 8989; GFX7-NEXT: s_waitcnt vmcnt(0) 8990; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 8991; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 8992; GFX7-NEXT: v_mov_b32_e32 v2, s6 8993; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start 8994; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8995; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 8996; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 8997; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 8998; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 8999; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 9000; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 9001; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 9002; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 9003; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 9004; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 9005; GFX7-NEXT: v_mov_b32_e32 v6, v4 9006; GFX7-NEXT: v_mov_b32_e32 v5, v3 9007; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 9008; GFX7-NEXT: s_waitcnt vmcnt(0) 
9009; GFX7-NEXT: buffer_wbinvl1 9010; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 9011; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 9012; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9013; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 9014; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9015; GFX7-NEXT: s_cbranch_execnz .LBB27_1 9016; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9017; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9018; GFX7-NEXT: s_setpc_b64 s[30:31] 9019; 9020; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 9021; GFX6: ; %bb.0: 9022; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9023; GFX6-NEXT: v_mov_b32_e32 v2, s20 9024; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 9025; GFX6-NEXT: s_add_i32 s6, s20, 0x400 9026; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 9027; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 9028; GFX6-NEXT: s_mov_b64 s[4:5], 0 9029; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9030; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9031; GFX6-NEXT: s_waitcnt vmcnt(0) 9032; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 9033; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 9034; GFX6-NEXT: v_mov_b32_e32 v2, s6 9035; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start 9036; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 9037; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 9038; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 9039; GFX6-NEXT: s_waitcnt expcnt(0) 9040; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 9041; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 9042; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 9043; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 9044; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 9045; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 9046; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 9047; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 9048; GFX6-NEXT: v_mov_b32_e32 v6, v4 9049; GFX6-NEXT: v_mov_b32_e32 v5, v3 9050; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 9051; GFX6-NEXT: s_waitcnt vmcnt(0) 9052; GFX6-NEXT: buffer_wbinvl1 9053; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v5, v4 9054; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 9055; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9056; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 9057; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 9058; GFX6-NEXT: s_cbranch_execnz .LBB27_1 9059; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 9060; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 9061; GFX6-NEXT: s_waitcnt expcnt(0) 9062; GFX6-NEXT: s_setpc_b64 s[30:31] 9063 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 9064 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9065 ret void 9066} 9067 9068define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { 9069; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9070; GFX12: ; %bb.0: 9071; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9072; GFX12-NEXT: s_wait_expcnt 0x0 9073; GFX12-NEXT: s_wait_samplecnt 0x0 9074; GFX12-NEXT: s_wait_bvhcnt 0x0 9075; GFX12-NEXT: s_wait_kmcnt 0x0 9076; GFX12-NEXT: s_mov_b32 s1, exec_lo 9077; GFX12-NEXT: s_wait_storecnt 0x0 9078; GFX12-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9079; GFX12-NEXT: v_readfirstlane_b32 s4, v0 9080; GFX12-NEXT: v_readfirstlane_b32 s5, v1 9081; GFX12-NEXT: v_readfirstlane_b32 s6, v2 9082; GFX12-NEXT: v_readfirstlane_b32 s7, v3 9083; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 9084; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 9085; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 9086; GFX12-NEXT: s_wait_alu 0xfffe 9087; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9088; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 9089; GFX12-NEXT: s_wait_alu 0xfffe 9090; GFX12-NEXT: s_and_saveexec_b32 s0, s0 9091; GFX12-NEXT: s_wait_loadcnt 0x0 9092; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 
th:TH_ATOMIC_RETURN 9093; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 9094; GFX12-NEXT: ; implicit-def: $vgpr4 9095; GFX12-NEXT: s_wait_alu 0xfffe 9096; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 9097; GFX12-NEXT: s_cbranch_execnz .LBB28_1 9098; GFX12-NEXT: ; %bb.2: 9099; GFX12-NEXT: s_mov_b32 exec_lo, s1 9100; GFX12-NEXT: s_wait_loadcnt 0x0 9101; GFX12-NEXT: v_mov_b32_e32 v0, v5 9102; GFX12-NEXT: global_inv scope:SCOPE_DEV 9103; GFX12-NEXT: s_wait_alu 0xfffe 9104; GFX12-NEXT: s_setpc_b64 s[30:31] 9105; 9106; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9107; GFX940: ; %bb.0: 9108; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9109; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 9110; GFX940-NEXT: s_mov_b64 s[2:3], exec 9111; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9112; GFX940-NEXT: v_readfirstlane_b32 s4, v0 9113; GFX940-NEXT: v_readfirstlane_b32 s5, v1 9114; GFX940-NEXT: v_readfirstlane_b32 s6, v2 9115; GFX940-NEXT: v_readfirstlane_b32 s7, v3 9116; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 9117; GFX940-NEXT: s_nop 0 9118; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 9119; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 9120; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 9121; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 9122; GFX940-NEXT: ; implicit-def: $vgpr4 9123; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 9124; GFX940-NEXT: s_cbranch_execnz .LBB28_1 9125; GFX940-NEXT: ; %bb.2: 9126; GFX940-NEXT: s_mov_b64 exec, s[2:3] 9127; GFX940-NEXT: s_mov_b64 s[2:3], 0 9128; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 9129; GFX940-NEXT: s_movk_i32 s10, 0x7fff 9130; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 9131; GFX940-NEXT: s_mov_b32 s11, 0x7060302 9132; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start 9133; GFX940-NEXT: ; =>This Loop Header: Depth=1 9134; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 9135; GFX940-NEXT: s_waitcnt vmcnt(0) 9136; 
GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 9137; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 9138; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 9139; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 9140; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 9141; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9142; GFX940-NEXT: s_mov_b64 s[8:9], exec 9143; GFX940-NEXT: buffer_wbl2 sc1 9144; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 9145; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 9146; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 9147; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 9148; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 9149; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 9150; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9151; GFX940-NEXT: s_nop 1 9152; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc 9153; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 9154; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 9155; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9156; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 9157; GFX940-NEXT: v_readfirstlane_b32 s4, v0 9158; GFX940-NEXT: v_readfirstlane_b32 s5, v1 9159; GFX940-NEXT: v_readfirstlane_b32 s6, v2 9160; GFX940-NEXT: v_readfirstlane_b32 s7, v3 9161; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 9162; GFX940-NEXT: s_nop 0 9163; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 9164; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 9165; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 9166; GFX940-NEXT: s_waitcnt vmcnt(0) 9167; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 9168; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 9169; GFX940-NEXT: s_cbranch_execnz .LBB28_4 9170; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9171; GFX940-NEXT: s_mov_b64 exec, s[8:9] 9172; GFX940-NEXT: s_waitcnt vmcnt(0) 9173; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 9174; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 9175; GFX940-NEXT: v_mov_b32_e32 v7, v4 9176; GFX940-NEXT: buffer_inv sc1 9177; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 9178; GFX940-NEXT: 
s_cbranch_execnz .LBB28_3 9179; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 9180; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 9181; GFX940-NEXT: v_mov_b32_e32 v0, v4 9182; GFX940-NEXT: s_setpc_b64 s[30:31] 9183; 9184; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9185; GFX11: ; %bb.0: 9186; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9187; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 9188; GFX11-NEXT: s_mov_b32 s1, 0 9189; GFX11-NEXT: s_mov_b32 s2, exec_lo 9190; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9191; GFX11-NEXT: v_readfirstlane_b32 s4, v0 9192; GFX11-NEXT: v_readfirstlane_b32 s5, v1 9193; GFX11-NEXT: v_readfirstlane_b32 s6, v2 9194; GFX11-NEXT: v_readfirstlane_b32 s7, v3 9195; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 9196; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 9197; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 9198; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 9199; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 9200; GFX11-NEXT: s_and_saveexec_b32 s0, s0 9201; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 9202; GFX11-NEXT: ; implicit-def: $vgpr4 9203; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 9204; GFX11-NEXT: s_cbranch_execnz .LBB28_1 9205; GFX11-NEXT: ; %bb.2: 9206; GFX11-NEXT: s_mov_b32 exec_lo, s2 9207; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 9208; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 9209; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 9210; GFX11-NEXT: .p2align 6 9211; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start 9212; GFX11-NEXT: ; =>This Loop Header: Depth=1 9213; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 9214; GFX11-NEXT: s_waitcnt vmcnt(0) 9215; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 9216; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 9217; GFX11-NEXT: s_mov_b32 s2, exec_lo 9218; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9219; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9220; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 9221; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 9222; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 9223; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 9224; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 9225; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 9226; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 9227; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff 9228; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff 9229; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 9230; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo 9231; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9232; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo 9233; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9234; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 9235; GFX11-NEXT: v_mov_b32_e32 v4, v5 9236; GFX11-NEXT: v_mov_b32_e32 v5, v6 9237; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9238; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 9239; GFX11-NEXT: v_readfirstlane_b32 s4, v0 9240; GFX11-NEXT: v_readfirstlane_b32 s5, v1 9241; GFX11-NEXT: v_readfirstlane_b32 s6, v2 9242; GFX11-NEXT: v_readfirstlane_b32 s7, v3 9243; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 9244; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 9245; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 9246; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 9247; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 9248; GFX11-NEXT: s_and_saveexec_b32 s0, s0 9249; GFX11-NEXT: s_waitcnt vmcnt(0) 9250; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc 9251; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 9252; GFX11-NEXT: s_cbranch_execnz .LBB28_4 9253; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9254; GFX11-NEXT: s_mov_b32 exec_lo, s2 9255; GFX11-NEXT: s_waitcnt 
vmcnt(0) 9256; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 9257; GFX11-NEXT: v_mov_b32_e32 v6, v4 9258; GFX11-NEXT: buffer_gl1_inv 9259; GFX11-NEXT: buffer_gl0_inv 9260; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 9261; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9262; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 9263; GFX11-NEXT: s_cbranch_execnz .LBB28_3 9264; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 9265; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 9266; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 9267; GFX11-NEXT: v_mov_b32_e32 v0, v4 9268; GFX11-NEXT: s_setpc_b64 s[30:31] 9269; 9270; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9271; GFX10: ; %bb.0: 9272; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9273; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 9274; GFX10-NEXT: s_mov_b32 s5, 0 9275; GFX10-NEXT: s_mov_b32 s6, exec_lo 9276; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9277; GFX10-NEXT: v_readfirstlane_b32 s8, v0 9278; GFX10-NEXT: v_readfirstlane_b32 s9, v1 9279; GFX10-NEXT: v_readfirstlane_b32 s10, v2 9280; GFX10-NEXT: v_readfirstlane_b32 s11, v3 9281; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 9282; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 9283; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 9284; GFX10-NEXT: s_and_saveexec_b32 s4, s4 9285; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 9286; GFX10-NEXT: ; implicit-def: $vgpr4 9287; GFX10-NEXT: s_waitcnt_depctr 0xffe3 9288; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 9289; GFX10-NEXT: s_cbranch_execnz .LBB28_1 9290; GFX10-NEXT: ; %bb.2: 9291; GFX10-NEXT: s_mov_b32 exec_lo, s6 9292; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 9293; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 9294; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start 9295; GFX10-NEXT: ; =>This Loop Header: Depth=1 9296; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 9297; GFX10-NEXT: s_waitcnt vmcnt(0) 9298; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 9299; 
GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 9300; GFX10-NEXT: s_mov_b32 s6, exec_lo 9301; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9302; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 9303; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 9304; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 9305; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 9306; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 9307; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 9308; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 9309; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff 9310; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff 9311; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo 9312; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9313; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo 9314; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 9315; GFX10-NEXT: v_mov_b32_e32 v4, v5 9316; GFX10-NEXT: v_mov_b32_e32 v5, v6 9317; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9318; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 9319; GFX10-NEXT: v_readfirstlane_b32 s8, v0 9320; GFX10-NEXT: v_readfirstlane_b32 s9, v1 9321; GFX10-NEXT: v_readfirstlane_b32 s10, v2 9322; GFX10-NEXT: v_readfirstlane_b32 s11, v3 9323; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 9324; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 9325; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 9326; GFX10-NEXT: s_and_saveexec_b32 s4, s4 9327; GFX10-NEXT: s_waitcnt vmcnt(0) 9328; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 9329; GFX10-NEXT: s_waitcnt_depctr 0xffe3 9330; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 9331; GFX10-NEXT: s_cbranch_execnz .LBB28_4 9332; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9333; GFX10-NEXT: s_mov_b32 exec_lo, s6 9334; GFX10-NEXT: s_waitcnt vmcnt(0) 9335; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 9336; GFX10-NEXT: v_mov_b32_e32 v6, v4 9337; GFX10-NEXT: buffer_gl1_inv 9338; GFX10-NEXT: buffer_gl0_inv 9339; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 9340; GFX10-NEXT: s_waitcnt_depctr 0xffe3 9341; GFX10-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s5 9342; GFX10-NEXT: s_cbranch_execnz .LBB28_3 9343; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 9344; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 9345; GFX10-NEXT: v_mov_b32_e32 v0, v4 9346; GFX10-NEXT: s_setpc_b64 s[30:31] 9347; 9348; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9349; GFX90A: ; %bb.0: 9350; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9351; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 9352; GFX90A-NEXT: s_mov_b64 s[6:7], exec 9353; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9354; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 9355; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 9356; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 9357; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 9358; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9359; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9360; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9361; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9362; GFX90A-NEXT: s_nop 0 9363; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 9364; GFX90A-NEXT: ; implicit-def: $vgpr4 9365; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 9366; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 9367; GFX90A-NEXT: ; %bb.2: 9368; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 9369; GFX90A-NEXT: s_mov_b64 s[6:7], 0 9370; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 9371; GFX90A-NEXT: s_movk_i32 s14, 0x7fff 9372; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 9373; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 9374; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start 9375; GFX90A-NEXT: ; =>This Loop Header: Depth=1 9376; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 9377; GFX90A-NEXT: s_waitcnt vmcnt(0) 9378; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 9379; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 9380; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 9381; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 9382; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 9383; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9384; GFX90A-NEXT: 
v_cndmask_b32_e32 v4, v5, v6, vcc 9385; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 9386; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 9387; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 9388; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 9389; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 9390; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9391; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc 9392; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 9393; GFX90A-NEXT: s_mov_b64 s[12:13], exec 9394; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 9395; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9396; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 9397; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 9398; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 9399; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 9400; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 9401; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9402; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9403; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9404; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9405; GFX90A-NEXT: s_waitcnt vmcnt(0) 9406; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 9407; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 9408; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 9409; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9410; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 9411; GFX90A-NEXT: s_waitcnt vmcnt(0) 9412; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 9413; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9414; GFX90A-NEXT: v_mov_b32_e32 v7, v4 9415; GFX90A-NEXT: buffer_wbinvl1 9416; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 9417; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 9418; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 9419; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 9420; GFX90A-NEXT: v_mov_b32_e32 v0, v4 9421; GFX90A-NEXT: s_setpc_b64 s[30:31] 9422; 9423; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9424; GFX908: ; %bb.0: 9425; GFX908-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9426; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 9427; GFX908-NEXT: s_mov_b64 s[6:7], exec 9428; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9429; GFX908-NEXT: v_readfirstlane_b32 s8, v0 9430; GFX908-NEXT: v_readfirstlane_b32 s9, v1 9431; GFX908-NEXT: v_readfirstlane_b32 s10, v2 9432; GFX908-NEXT: v_readfirstlane_b32 s11, v3 9433; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9434; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9435; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9436; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9437; GFX908-NEXT: s_nop 0 9438; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 9439; GFX908-NEXT: ; implicit-def: $vgpr4 9440; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 9441; GFX908-NEXT: s_cbranch_execnz .LBB28_1 9442; GFX908-NEXT: ; %bb.2: 9443; GFX908-NEXT: s_mov_b64 exec, s[6:7] 9444; GFX908-NEXT: s_mov_b64 s[6:7], 0 9445; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 9446; GFX908-NEXT: s_movk_i32 s14, 0x7fff 9447; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 9448; GFX908-NEXT: s_mov_b32 s15, 0x7060302 9449; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start 9450; GFX908-NEXT: ; =>This Loop Header: Depth=1 9451; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 9452; GFX908-NEXT: s_waitcnt vmcnt(0) 9453; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 9454; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 9455; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 9456; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 9457; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 9458; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9459; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc 9460; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 9461; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 9462; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 9463; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 9464; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 9465; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9466; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc 9467; 
GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 9468; GFX908-NEXT: v_mov_b32_e32 v4, v5 9469; GFX908-NEXT: s_mov_b64 s[12:13], exec 9470; GFX908-NEXT: v_mov_b32_e32 v5, v6 9471; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9472; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 9473; GFX908-NEXT: v_readfirstlane_b32 s8, v0 9474; GFX908-NEXT: v_readfirstlane_b32 s9, v1 9475; GFX908-NEXT: v_readfirstlane_b32 s10, v2 9476; GFX908-NEXT: v_readfirstlane_b32 s11, v3 9477; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9478; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9479; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9480; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9481; GFX908-NEXT: s_waitcnt vmcnt(0) 9482; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 9483; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 9484; GFX908-NEXT: s_cbranch_execnz .LBB28_4 9485; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9486; GFX908-NEXT: s_mov_b64 exec, s[12:13] 9487; GFX908-NEXT: s_waitcnt vmcnt(0) 9488; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 9489; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9490; GFX908-NEXT: v_mov_b32_e32 v6, v4 9491; GFX908-NEXT: buffer_wbinvl1 9492; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 9493; GFX908-NEXT: s_cbranch_execnz .LBB28_3 9494; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 9495; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 9496; GFX908-NEXT: v_mov_b32_e32 v0, v4 9497; GFX908-NEXT: s_setpc_b64 s[30:31] 9498; 9499; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9500; GFX8: ; %bb.0: 9501; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9502; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 9503; GFX8-NEXT: s_mov_b64 s[6:7], exec 9504; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9505; GFX8-NEXT: v_readfirstlane_b32 s8, v0 9506; GFX8-NEXT: v_readfirstlane_b32 s9, v1 9507; GFX8-NEXT: v_readfirstlane_b32 s10, v2 9508; GFX8-NEXT: v_readfirstlane_b32 s11, v3 
9509; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9510; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9511; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9512; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9513; GFX8-NEXT: s_nop 0 9514; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 9515; GFX8-NEXT: ; implicit-def: $vgpr4 9516; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 9517; GFX8-NEXT: s_cbranch_execnz .LBB28_1 9518; GFX8-NEXT: ; %bb.2: 9519; GFX8-NEXT: s_mov_b64 exec, s[6:7] 9520; GFX8-NEXT: s_mov_b64 s[6:7], 0 9521; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 9522; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 9523; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start 9524; GFX8-NEXT: ; =>This Loop Header: Depth=1 9525; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 9526; GFX8-NEXT: s_waitcnt vmcnt(0) 9527; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 9528; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 9529; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 9530; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 9531; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 9532; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 9533; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9534; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc 9535; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 9536; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 9537; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 9538; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 9539; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 9540; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 9541; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9542; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc 9543; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9544; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 9545; GFX8-NEXT: v_mov_b32_e32 v4, v5 9546; GFX8-NEXT: s_mov_b64 s[12:13], exec 9547; GFX8-NEXT: v_mov_b32_e32 v5, v6 9548; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9549; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 9550; GFX8-NEXT: v_readfirstlane_b32 s8, v0 9551; GFX8-NEXT: v_readfirstlane_b32 s9, v1 9552; GFX8-NEXT: 
v_readfirstlane_b32 s10, v2 9553; GFX8-NEXT: v_readfirstlane_b32 s11, v3 9554; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9555; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9556; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9557; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9558; GFX8-NEXT: s_waitcnt vmcnt(0) 9559; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 9560; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 9561; GFX8-NEXT: s_cbranch_execnz .LBB28_4 9562; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9563; GFX8-NEXT: s_mov_b64 exec, s[12:13] 9564; GFX8-NEXT: s_waitcnt vmcnt(0) 9565; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 9566; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9567; GFX8-NEXT: v_mov_b32_e32 v6, v4 9568; GFX8-NEXT: buffer_wbinvl1 9569; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 9570; GFX8-NEXT: s_cbranch_execnz .LBB28_3 9571; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 9572; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 9573; GFX8-NEXT: v_mov_b32_e32 v0, v4 9574; GFX8-NEXT: s_setpc_b64 s[30:31] 9575; 9576; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9577; GFX7: ; %bb.0: 9578; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9579; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 9580; GFX7-NEXT: s_mov_b64 s[6:7], exec 9581; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9582; GFX7-NEXT: v_readfirstlane_b32 s8, v0 9583; GFX7-NEXT: v_readfirstlane_b32 s9, v1 9584; GFX7-NEXT: v_readfirstlane_b32 s10, v2 9585; GFX7-NEXT: v_readfirstlane_b32 s11, v3 9586; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9587; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9588; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9589; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9590; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 9591; GFX7-NEXT: ; implicit-def: $vgpr4 9592; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 9593; GFX7-NEXT: s_cbranch_execnz .LBB28_1 9594; GFX7-NEXT: ; 
%bb.2: 9595; GFX7-NEXT: s_mov_b64 exec, s[6:7] 9596; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9597; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 9598; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 9599; GFX7-NEXT: s_waitcnt vmcnt(0) 9600; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 9601; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 9602; GFX7-NEXT: s_mov_b64 s[6:7], 0 9603; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 9604; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start 9605; GFX7-NEXT: ; =>This Loop Header: Depth=1 9606; GFX7-NEXT: ; Child Loop BB28_4 Depth 2 9607; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 9608; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 9609; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 9610; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 9611; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 9612; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 9613; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 9614; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 9615; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 9616; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 9617; GFX7-NEXT: v_mov_b32_e32 v7, v5 9618; GFX7-NEXT: s_mov_b64 s[12:13], exec 9619; GFX7-NEXT: v_mov_b32_e32 v6, v4 9620; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9621; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 9622; GFX7-NEXT: v_readfirstlane_b32 s8, v0 9623; GFX7-NEXT: v_readfirstlane_b32 s9, v1 9624; GFX7-NEXT: v_readfirstlane_b32 s10, v2 9625; GFX7-NEXT: v_readfirstlane_b32 s11, v3 9626; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9627; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9628; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9629; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9630; GFX7-NEXT: s_waitcnt vmcnt(0) 9631; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc 9632; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 9633; GFX7-NEXT: s_cbranch_execnz .LBB28_4 9634; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9635; GFX7-NEXT: s_mov_b64 exec, s[12:13] 9636; GFX7-NEXT: s_waitcnt vmcnt(0) 9637; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 
9638; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 9639; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9640; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 9641; GFX7-NEXT: buffer_wbinvl1 9642; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 9643; GFX7-NEXT: s_cbranch_execnz .LBB28_3 9644; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 9645; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 9646; GFX7-NEXT: v_mov_b32_e32 v0, v7 9647; GFX7-NEXT: v_mov_b32_e32 v1, v4 9648; GFX7-NEXT: s_setpc_b64 s[30:31] 9649; 9650; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 9651; GFX6: ; %bb.0: 9652; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9653; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 9654; GFX6-NEXT: s_mov_b64 s[6:7], exec 9655; GFX6-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 9656; GFX6-NEXT: v_readfirstlane_b32 s8, v0 9657; GFX6-NEXT: v_readfirstlane_b32 s9, v1 9658; GFX6-NEXT: v_readfirstlane_b32 s10, v2 9659; GFX6-NEXT: v_readfirstlane_b32 s11, v3 9660; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9661; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9662; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9663; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9664; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 9665; GFX6-NEXT: ; implicit-def: $vgpr4 9666; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 9667; GFX6-NEXT: s_cbranch_execnz .LBB28_1 9668; GFX6-NEXT: ; %bb.2: 9669; GFX6-NEXT: s_mov_b64 exec, s[6:7] 9670; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 9671; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 9672; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 9673; GFX6-NEXT: s_waitcnt vmcnt(0) 9674; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 9675; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 9676; GFX6-NEXT: s_mov_b64 s[6:7], 0 9677; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 9678; GFX6-NEXT: .LBB28_3: ; %atomicrmw.start 9679; GFX6-NEXT: ; =>This Loop Header: Depth=1 9680; GFX6-NEXT: ; Child Loop BB28_4 Depth 2 9681; GFX6-NEXT: v_mul_f32_e32 
v5, 1.0, v7 9682; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 9683; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 9684; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 9685; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 9686; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 9687; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 9688; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 9689; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 9690; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 9691; GFX6-NEXT: v_mov_b32_e32 v7, v5 9692; GFX6-NEXT: s_mov_b64 s[12:13], exec 9693; GFX6-NEXT: v_mov_b32_e32 v6, v4 9694; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 9695; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 9696; GFX6-NEXT: v_readfirstlane_b32 s8, v0 9697; GFX6-NEXT: v_readfirstlane_b32 s9, v1 9698; GFX6-NEXT: v_readfirstlane_b32 s10, v2 9699; GFX6-NEXT: v_readfirstlane_b32 s11, v3 9700; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 9701; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 9702; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 9703; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 9704; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 9705; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc 9706; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 9707; GFX6-NEXT: s_cbranch_execnz .LBB28_4 9708; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 9709; GFX6-NEXT: s_mov_b64 exec, s[12:13] 9710; GFX6-NEXT: s_waitcnt vmcnt(0) 9711; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 9712; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 9713; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9714; GFX6-NEXT: s_waitcnt expcnt(0) 9715; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 9716; GFX6-NEXT: buffer_wbinvl1 9717; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 9718; GFX6-NEXT: s_cbranch_execnz .LBB28_3 9719; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 9720; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 9721; GFX6-NEXT: v_mov_b32_e32 v0, v7 9722; GFX6-NEXT: v_mov_b32_e32 v1, v4 9723; GFX6-NEXT: s_setpc_b64 s[30:31] 9724 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, 
i32 256 9725 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9726 ret <2 x bfloat> %result 9727} 9728 9729define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) { 9730; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9731; GFX12: ; %bb.0: 9732; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9733; GFX12-NEXT: s_wait_expcnt 0x0 9734; GFX12-NEXT: s_wait_samplecnt 0x0 9735; GFX12-NEXT: s_wait_bvhcnt 0x0 9736; GFX12-NEXT: s_wait_kmcnt 0x0 9737; GFX12-NEXT: v_mov_b32_e32 v1, s16 9738; GFX12-NEXT: s_wait_storecnt 0x0 9739; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 9740; GFX12-NEXT: s_wait_loadcnt 0x0 9741; GFX12-NEXT: global_inv scope:SCOPE_DEV 9742; GFX12-NEXT: s_setpc_b64 s[30:31] 9743; 9744; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9745; GFX940: ; %bb.0: 9746; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9747; GFX940-NEXT: v_mov_b32_e32 v1, v0 9748; GFX940-NEXT: v_mov_b32_e32 v0, s16 9749; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 9750; GFX940-NEXT: s_add_i32 s4, s16, 0x400 9751; GFX940-NEXT: s_mov_b64 s[6:7], 0 9752; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9753; GFX940-NEXT: s_movk_i32 s8, 0x7fff 9754; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 9755; GFX940-NEXT: s_mov_b32 s9, 0x7060302 9756; GFX940-NEXT: v_mov_b32_e32 v4, s4 9757; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start 9758; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9759; GFX940-NEXT: s_waitcnt vmcnt(0) 9760; GFX940-NEXT: v_mov_b32_e32 v7, v0 9761; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 9762; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 9763; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 9764; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 9765; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 9766; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 9767; GFX940-NEXT: 
v_or_b32_e32 v6, 0x400000, v0 9768; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 9769; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 9770; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 9771; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9772; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 9773; GFX940-NEXT: buffer_wbl2 sc1 9774; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 9775; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 9776; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 9777; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] 9778; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 9779; GFX940-NEXT: s_waitcnt vmcnt(0) 9780; GFX940-NEXT: buffer_inv sc1 9781; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 9782; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9783; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 9784; GFX940-NEXT: s_cbranch_execnz .LBB29_1 9785; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9786; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 9787; GFX940-NEXT: s_setpc_b64 s[30:31] 9788; 9789; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9790; GFX11: ; %bb.0: 9791; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9792; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 9793; GFX11-NEXT: s_add_i32 s4, s16, 0x400 9794; GFX11-NEXT: s_mov_b32 s5, 0 9795; GFX11-NEXT: v_mov_b32_e32 v4, s4 9796; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 9797; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9798; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 9799; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 9800; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 9801; GFX11-NEXT: .p2align 6 9802; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start 9803; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9804; GFX11-NEXT: s_waitcnt vmcnt(0) 9805; GFX11-NEXT: v_mov_b32_e32 v6, v0 9806; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9807; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9808; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 9809; GFX11-NEXT: 
v_add_f32_e32 v1, v1, v3 9810; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 9811; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 9812; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 9813; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9814; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 9815; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9816; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 9817; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 9818; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 9819; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 9820; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 9821; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 9822; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 9823; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9824; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 9825; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 9826; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9827; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 9828; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc 9829; GFX11-NEXT: s_waitcnt vmcnt(0) 9830; GFX11-NEXT: buffer_gl1_inv 9831; GFX11-NEXT: buffer_gl0_inv 9832; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 9833; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 9834; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9835; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 9836; GFX11-NEXT: s_cbranch_execnz .LBB29_1 9837; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9838; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 9839; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 9840; GFX11-NEXT: s_setpc_b64 s[30:31] 9841; 9842; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9843; GFX10: ; %bb.0: 9844; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9845; GFX10-NEXT: v_mov_b32_e32 v1, v0 9846; GFX10-NEXT: v_mov_b32_e32 v0, s20 9847; GFX10-NEXT: s_add_i32 s4, s20, 0x400 9848; GFX10-NEXT: 
s_mov_b32 s5, 0 9849; GFX10-NEXT: v_mov_b32_e32 v4, s4 9850; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9851; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 9852; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 9853; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start 9854; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9855; GFX10-NEXT: s_waitcnt vmcnt(0) 9856; GFX10-NEXT: v_mov_b32_e32 v6, v0 9857; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9858; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 9859; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 9860; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 9861; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 9862; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 9863; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 9864; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 9865; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 9866; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9867; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 9868; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 9869; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 9870; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo 9871; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 9872; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 9873; GFX10-NEXT: v_mov_b32_e32 v0, v5 9874; GFX10-NEXT: v_mov_b32_e32 v1, v6 9875; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 9876; GFX10-NEXT: s_waitcnt vmcnt(0) 9877; GFX10-NEXT: buffer_gl1_inv 9878; GFX10-NEXT: buffer_gl0_inv 9879; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 9880; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 9881; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 9882; GFX10-NEXT: s_cbranch_execnz .LBB29_1 9883; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9884; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 9885; GFX10-NEXT: s_setpc_b64 s[30:31] 9886; 9887; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9888; GFX90A: ; %bb.0: 9889; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9890; GFX90A-NEXT: v_mov_b32_e32 v1, v0 9891; GFX90A-NEXT: v_mov_b32_e32 v0, s20 9892; GFX90A-NEXT: 
buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 9893; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 9894; GFX90A-NEXT: s_mov_b64 s[6:7], 0 9895; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9896; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 9897; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 9898; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 9899; GFX90A-NEXT: v_mov_b32_e32 v4, s4 9900; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start 9901; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9902; GFX90A-NEXT: s_waitcnt vmcnt(0) 9903; GFX90A-NEXT: v_mov_b32_e32 v7, v0 9904; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 9905; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 9906; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 9907; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 9908; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 9909; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 9910; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 9911; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 9912; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 9913; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 9914; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9915; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 9916; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 9917; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 9918; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 9919; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] 9920; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 9921; GFX90A-NEXT: s_waitcnt vmcnt(0) 9922; GFX90A-NEXT: buffer_wbinvl1 9923; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 9924; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9925; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 9926; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 9927; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9928; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 9929; GFX90A-NEXT: s_setpc_b64 s[30:31] 9930; 9931; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9932; GFX908: ; %bb.0: 9933; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9934; GFX908-NEXT: v_mov_b32_e32 v1, v0 
9935; GFX908-NEXT: v_mov_b32_e32 v0, s20 9936; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 9937; GFX908-NEXT: s_add_i32 s4, s20, 0x400 9938; GFX908-NEXT: s_mov_b64 s[6:7], 0 9939; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9940; GFX908-NEXT: s_movk_i32 s8, 0x7fff 9941; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 9942; GFX908-NEXT: s_mov_b32 s9, 0x7060302 9943; GFX908-NEXT: v_mov_b32_e32 v4, s4 9944; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start 9945; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9946; GFX908-NEXT: s_waitcnt vmcnt(0) 9947; GFX908-NEXT: v_mov_b32_e32 v6, v0 9948; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 9949; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 9950; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 9951; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 9952; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 9953; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 9954; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 9955; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 9956; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 9957; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 9958; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9959; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 9960; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 9961; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 9962; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 9963; GFX908-NEXT: v_mov_b32_e32 v0, v5 9964; GFX908-NEXT: v_mov_b32_e32 v1, v6 9965; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 9966; GFX908-NEXT: s_waitcnt vmcnt(0) 9967; GFX908-NEXT: buffer_wbinvl1 9968; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 9969; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 9970; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 9971; GFX908-NEXT: s_cbranch_execnz .LBB29_1 9972; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9973; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 9974; GFX908-NEXT: s_setpc_b64 s[30:31] 9975; 9976; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 9977; GFX8: ; %bb.0: 9978; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 9979; GFX8-NEXT: v_mov_b32_e32 v1, v0 9980; GFX8-NEXT: v_mov_b32_e32 v0, s20 9981; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 9982; GFX8-NEXT: s_add_i32 s4, s20, 0x400 9983; GFX8-NEXT: s_mov_b64 s[6:7], 0 9984; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9985; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 9986; GFX8-NEXT: v_mov_b32_e32 v4, s4 9987; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start 9988; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9989; GFX8-NEXT: s_waitcnt vmcnt(0) 9990; GFX8-NEXT: v_mov_b32_e32 v6, v0 9991; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 9992; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 9993; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 9994; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 9995; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 9996; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 9997; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 9998; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 9999; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 10000; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10001; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 10002; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 10003; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 10004; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10005; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 10006; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 10007; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10008; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 10009; GFX8-NEXT: v_mov_b32_e32 v0, v5 10010; GFX8-NEXT: v_mov_b32_e32 v1, v6 10011; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 10012; GFX8-NEXT: s_waitcnt vmcnt(0) 10013; GFX8-NEXT: buffer_wbinvl1 10014; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 10015; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10016; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 10017; GFX8-NEXT: s_cbranch_execnz .LBB29_1 10018; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10019; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 10020; GFX8-NEXT: s_setpc_b64 s[30:31] 10021; 10022; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 10023; GFX7: ; %bb.0: 10024; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10025; GFX7-NEXT: v_mov_b32_e32 v2, s20 10026; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 10027; GFX7-NEXT: s_add_i32 s6, s20, 0x400 10028; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 10029; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 10030; GFX7-NEXT: s_mov_b64 s[4:5], 0 10031; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 10032; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10033; GFX7-NEXT: s_waitcnt vmcnt(0) 10034; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 10035; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 10036; GFX7-NEXT: v_mov_b32_e32 v4, s6 10037; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start 10038; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10039; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 10040; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 10041; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 10042; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 10043; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10044; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 10045; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 10046; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 10047; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 10048; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 10049; GFX7-NEXT: v_mov_b32_e32 v6, v1 10050; GFX7-NEXT: v_mov_b32_e32 v5, v0 10051; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10052; GFX7-NEXT: s_waitcnt vmcnt(0) 10053; GFX7-NEXT: buffer_wbinvl1 10054; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 10055; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 10056; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10057; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 10058; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10059; GFX7-NEXT: s_cbranch_execnz .LBB29_1 10060; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10061; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10062; GFX7-NEXT: s_setpc_b64 s[30:31] 10063; 10064; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: 10065; GFX6: ; %bb.0: 
10066; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10067; GFX6-NEXT: v_mov_b32_e32 v2, s20 10068; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 10069; GFX6-NEXT: s_add_i32 s6, s20, 0x400 10070; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 10071; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 10072; GFX6-NEXT: s_mov_b64 s[4:5], 0 10073; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 10074; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10075; GFX6-NEXT: s_waitcnt vmcnt(0) 10076; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 10077; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 10078; GFX6-NEXT: v_mov_b32_e32 v4, s6 10079; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start 10080; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 10081; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 10082; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 10083; GFX6-NEXT: s_waitcnt expcnt(0) 10084; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 10085; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 10086; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10087; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 10088; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 10089; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 10090; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 10091; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 10092; GFX6-NEXT: v_mov_b32_e32 v6, v1 10093; GFX6-NEXT: v_mov_b32_e32 v5, v0 10094; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10095; GFX6-NEXT: s_waitcnt vmcnt(0) 10096; GFX6-NEXT: buffer_wbinvl1 10097; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 10098; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 10099; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10100; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 10101; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 10102; GFX6-NEXT: s_cbranch_execnz .LBB29_1 10103; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 10104; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 10105; GFX6-NEXT: s_waitcnt expcnt(0) 10106; GFX6-NEXT: s_setpc_b64 s[30:31] 10107 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 10108 %result = atomicrmw fadd 
ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst 10109 ret <2 x bfloat> %result 10110} 10111 10112define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) { 10113; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10114; GFX12: ; %bb.0: 10115; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10116; GFX12-NEXT: s_wait_expcnt 0x0 10117; GFX12-NEXT: s_wait_samplecnt 0x0 10118; GFX12-NEXT: s_wait_bvhcnt 0x0 10119; GFX12-NEXT: s_wait_kmcnt 0x0 10120; GFX12-NEXT: v_mov_b32_e32 v1, s16 10121; GFX12-NEXT: s_wait_storecnt 0x0 10122; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 10123; GFX12-NEXT: s_wait_storecnt 0x0 10124; GFX12-NEXT: global_inv scope:SCOPE_DEV 10125; GFX12-NEXT: s_setpc_b64 s[30:31] 10126; 10127; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10128; GFX940: ; %bb.0: 10129; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10130; GFX940-NEXT: v_mov_b32_e32 v1, s16 10131; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 10132; GFX940-NEXT: s_add_i32 s4, s16, 0x400 10133; GFX940-NEXT: s_mov_b64 s[6:7], 0 10134; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 10135; GFX940-NEXT: s_movk_i32 s8, 0x7fff 10136; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 10137; GFX940-NEXT: s_mov_b32 s9, 0x7060302 10138; GFX940-NEXT: v_mov_b32_e32 v4, s4 10139; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start 10140; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10141; GFX940-NEXT: s_waitcnt vmcnt(0) 10142; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 10143; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10144; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 10145; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 10146; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 10147; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 10148; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 10149; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 10150; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 10151; 
GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 10152; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10153; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10154; GFX940-NEXT: buffer_wbl2 sc1 10155; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 10156; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 10157; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 10158; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] 10159; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 10160; GFX940-NEXT: s_waitcnt vmcnt(0) 10161; GFX940-NEXT: buffer_inv sc1 10162; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 10163; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10164; GFX940-NEXT: v_mov_b32_e32 v1, v6 10165; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 10166; GFX940-NEXT: s_cbranch_execnz .LBB30_1 10167; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10168; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 10169; GFX940-NEXT: s_setpc_b64 s[30:31] 10170; 10171; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10172; GFX11: ; %bb.0: 10173; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10174; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 10175; GFX11-NEXT: s_add_i32 s4, s16, 0x400 10176; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10177; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 10178; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 10179; GFX11-NEXT: s_mov_b32 s5, 0 10180; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 10181; GFX11-NEXT: .p2align 6 10182; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start 10183; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10184; GFX11-NEXT: s_waitcnt vmcnt(0) 10185; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10186; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 10187; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10188; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10189; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 10190; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 
10191; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 10192; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 10193; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 10194; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 10195; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 10196; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 10197; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 10198; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 10199; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 10200; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 10201; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 10202; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10203; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 10204; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 10205; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc 10206; GFX11-NEXT: s_waitcnt vmcnt(0) 10207; GFX11-NEXT: buffer_gl1_inv 10208; GFX11-NEXT: buffer_gl0_inv 10209; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 10210; GFX11-NEXT: v_mov_b32_e32 v1, v5 10211; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 10212; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10213; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 10214; GFX11-NEXT: s_cbranch_execnz .LBB30_1 10215; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10216; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 10217; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 10218; GFX11-NEXT: s_setpc_b64 s[30:31] 10219; 10220; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10221; GFX10: ; %bb.0: 10222; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10223; GFX10-NEXT: v_mov_b32_e32 v1, s20 10224; GFX10-NEXT: s_add_i32 s4, s20, 0x400 10225; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 10226; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 10227; GFX10-NEXT: v_mov_b32_e32 v4, s4 10228; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 10229; GFX10-NEXT: s_mov_b32 s5, 0 10230; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start 
10231; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10232; GFX10-NEXT: s_waitcnt vmcnt(0) 10233; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 10234; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10235; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10236; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 10237; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 10238; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 10239; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 10240; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 10241; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 10242; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 10243; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 10244; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 10245; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 10246; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 10247; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 10248; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 10249; GFX10-NEXT: v_mov_b32_e32 v6, v1 10250; GFX10-NEXT: v_mov_b32_e32 v5, v0 10251; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10252; GFX10-NEXT: s_waitcnt vmcnt(0) 10253; GFX10-NEXT: buffer_gl1_inv 10254; GFX10-NEXT: buffer_gl0_inv 10255; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 10256; GFX10-NEXT: v_mov_b32_e32 v1, v5 10257; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 10258; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 10259; GFX10-NEXT: s_cbranch_execnz .LBB30_1 10260; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10261; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 10262; GFX10-NEXT: s_setpc_b64 s[30:31] 10263; 10264; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10265; GFX90A: ; %bb.0: 10266; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10267; GFX90A-NEXT: v_mov_b32_e32 v1, s20 10268; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 10269; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 10270; GFX90A-NEXT: s_mov_b64 s[6:7], 0 10271; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 10272; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 10273; GFX90A-NEXT: v_and_b32_e32 v3, 
0xffff0000, v0 10274; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 10275; GFX90A-NEXT: v_mov_b32_e32 v4, s4 10276; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start 10277; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10278; GFX90A-NEXT: s_waitcnt vmcnt(0) 10279; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 10280; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10281; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 10282; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 10283; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 10284; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 10285; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 10286; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 10287; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 10288; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 10289; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10290; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10291; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 10292; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 10293; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 10294; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] 10295; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc 10296; GFX90A-NEXT: s_waitcnt vmcnt(0) 10297; GFX90A-NEXT: buffer_wbinvl1 10298; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 10299; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10300; GFX90A-NEXT: v_mov_b32_e32 v1, v6 10301; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 10302; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 10303; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10304; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 10305; GFX90A-NEXT: s_setpc_b64 s[30:31] 10306; 10307; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10308; GFX908: ; %bb.0: 10309; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10310; GFX908-NEXT: v_mov_b32_e32 v1, s20 10311; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 10312; GFX908-NEXT: s_add_i32 s4, s20, 0x400 10313; GFX908-NEXT: s_mov_b64 s[6:7], 0 10314; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 10315; 
GFX908-NEXT: s_movk_i32 s8, 0x7fff 10316; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 10317; GFX908-NEXT: s_mov_b32 s9, 0x7060302 10318; GFX908-NEXT: v_mov_b32_e32 v4, s4 10319; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start 10320; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10321; GFX908-NEXT: s_waitcnt vmcnt(0) 10322; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 10323; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10324; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 10325; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 10326; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 10327; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 10328; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 10329; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 10330; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 10331; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 10332; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10333; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10334; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 10335; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 10336; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 10337; GFX908-NEXT: v_mov_b32_e32 v6, v1 10338; GFX908-NEXT: v_mov_b32_e32 v5, v0 10339; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10340; GFX908-NEXT: s_waitcnt vmcnt(0) 10341; GFX908-NEXT: buffer_wbinvl1 10342; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 10343; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10344; GFX908-NEXT: v_mov_b32_e32 v1, v5 10345; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 10346; GFX908-NEXT: s_cbranch_execnz .LBB30_1 10347; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10348; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 10349; GFX908-NEXT: s_setpc_b64 s[30:31] 10350; 10351; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10352; GFX8: ; %bb.0: 10353; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10354; GFX8-NEXT: v_mov_b32_e32 v1, s20 10355; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 10356; GFX8-NEXT: s_add_i32 s4, s20, 0x400 10357; GFX8-NEXT: s_mov_b64 
s[6:7], 0 10358; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 10359; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 10360; GFX8-NEXT: v_mov_b32_e32 v4, s4 10361; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start 10362; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10363; GFX8-NEXT: s_waitcnt vmcnt(0) 10364; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 10365; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 10366; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 10367; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 10368; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 10369; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 10370; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 10371; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 10372; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 10373; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10374; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 10375; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10376; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 10377; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10378; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 10379; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 10380; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 10381; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 10382; GFX8-NEXT: v_mov_b32_e32 v6, v1 10383; GFX8-NEXT: v_mov_b32_e32 v5, v0 10384; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10385; GFX8-NEXT: s_waitcnt vmcnt(0) 10386; GFX8-NEXT: buffer_wbinvl1 10387; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 10388; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10389; GFX8-NEXT: v_mov_b32_e32 v1, v5 10390; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 10391; GFX8-NEXT: s_cbranch_execnz .LBB30_1 10392; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10393; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 10394; GFX8-NEXT: s_setpc_b64 s[30:31] 10395; 10396; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10397; GFX7: ; %bb.0: 10398; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10399; GFX7-NEXT: v_mov_b32_e32 v2, s20 10400; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 
10401; GFX7-NEXT: s_add_i32 s6, s20, 0x400 10402; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 10403; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 10404; GFX7-NEXT: s_mov_b64 s[4:5], 0 10405; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10406; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10407; GFX7-NEXT: s_waitcnt vmcnt(0) 10408; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 10409; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 10410; GFX7-NEXT: v_mov_b32_e32 v2, s6 10411; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start 10412; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10413; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 10414; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 10415; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 10416; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 10417; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 10418; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 10419; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 10420; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 10421; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 10422; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 10423; GFX7-NEXT: v_mov_b32_e32 v6, v4 10424; GFX7-NEXT: v_mov_b32_e32 v5, v3 10425; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 10426; GFX7-NEXT: s_waitcnt vmcnt(0) 10427; GFX7-NEXT: buffer_wbinvl1 10428; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 10429; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 10430; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10431; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 10432; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10433; GFX7-NEXT: s_cbranch_execnz .LBB30_1 10434; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10435; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10436; GFX7-NEXT: s_setpc_b64 s[30:31] 10437; 10438; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: 10439; GFX6: ; %bb.0: 10440; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10441; GFX6-NEXT: v_mov_b32_e32 v2, s20 10442; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 10443; GFX6-NEXT: s_add_i32 s6, s20, 0x400 10444; GFX6-NEXT: v_mul_f32_e32 
v0, 1.0, v0 10445; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 10446; GFX6-NEXT: s_mov_b64 s[4:5], 0 10447; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10448; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10449; GFX6-NEXT: s_waitcnt vmcnt(0) 10450; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 10451; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 10452; GFX6-NEXT: v_mov_b32_e32 v2, s6 10453; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start 10454; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 10455; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 10456; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 10457; GFX6-NEXT: s_waitcnt expcnt(0) 10458; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 10459; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 10460; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 10461; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 10462; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 10463; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 10464; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 10465; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 10466; GFX6-NEXT: v_mov_b32_e32 v6, v4 10467; GFX6-NEXT: v_mov_b32_e32 v5, v3 10468; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 10469; GFX6-NEXT: s_waitcnt vmcnt(0) 10470; GFX6-NEXT: buffer_wbinvl1 10471; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 10472; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 10473; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10474; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 10475; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 10476; GFX6-NEXT: s_cbranch_execnz .LBB30_1 10477; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 10478; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 10479; GFX6-NEXT: s_waitcnt expcnt(0) 10480; GFX6-NEXT: s_setpc_b64 s[30:31] 10481 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 10482 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst 10483 ret void 10484} 10485 10486define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) 
#0 { 10487; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10488; GFX12: ; %bb.0: 10489; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10490; GFX12-NEXT: s_wait_expcnt 0x0 10491; GFX12-NEXT: s_wait_samplecnt 0x0 10492; GFX12-NEXT: s_wait_bvhcnt 0x0 10493; GFX12-NEXT: s_wait_kmcnt 0x0 10494; GFX12-NEXT: v_mov_b32_e32 v1, s16 10495; GFX12-NEXT: s_wait_storecnt 0x0 10496; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 10497; GFX12-NEXT: s_wait_loadcnt 0x0 10498; GFX12-NEXT: global_inv scope:SCOPE_DEV 10499; GFX12-NEXT: s_setpc_b64 s[30:31] 10500; 10501; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10502; GFX940: ; %bb.0: 10503; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10504; GFX940-NEXT: v_mov_b32_e32 v1, v0 10505; GFX940-NEXT: v_mov_b32_e32 v0, s16 10506; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 10507; GFX940-NEXT: s_add_i32 s4, s16, 0x400 10508; GFX940-NEXT: s_mov_b64 s[6:7], 0 10509; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 10510; GFX940-NEXT: s_movk_i32 s8, 0x7fff 10511; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10512; GFX940-NEXT: s_mov_b32 s9, 0x7060302 10513; GFX940-NEXT: v_mov_b32_e32 v4, s4 10514; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start 10515; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10516; GFX940-NEXT: s_waitcnt vmcnt(0) 10517; GFX940-NEXT: v_mov_b32_e32 v7, v0 10518; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 10519; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 10520; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 10521; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 10522; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 10523; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 10524; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 10525; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 10526; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 10527; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 10528; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 
10529; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10530; GFX940-NEXT: buffer_wbl2 sc1 10531; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 10532; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 10533; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 10534; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] 10535; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 10536; GFX940-NEXT: s_waitcnt vmcnt(0) 10537; GFX940-NEXT: buffer_inv sc1 10538; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 10539; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10540; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 10541; GFX940-NEXT: s_cbranch_execnz .LBB31_1 10542; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10543; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 10544; GFX940-NEXT: s_setpc_b64 s[30:31] 10545; 10546; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10547; GFX11: ; %bb.0: 10548; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10549; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 10550; GFX11-NEXT: s_add_i32 s4, s16, 0x400 10551; GFX11-NEXT: s_mov_b32 s5, 0 10552; GFX11-NEXT: v_mov_b32_e32 v4, s4 10553; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 10554; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 10555; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 10556; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10557; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 10558; GFX11-NEXT: .p2align 6 10559; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start 10560; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10561; GFX11-NEXT: s_waitcnt vmcnt(0) 10562; GFX11-NEXT: v_mov_b32_e32 v6, v0 10563; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10564; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10565; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 10566; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 10567; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10568; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 
10569; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 10570; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 10571; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 10572; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10573; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 10574; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 10575; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10576; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 10577; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 10578; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 10579; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 10580; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10581; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 10582; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 10583; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10584; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 10585; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc 10586; GFX11-NEXT: s_waitcnt vmcnt(0) 10587; GFX11-NEXT: buffer_gl1_inv 10588; GFX11-NEXT: buffer_gl0_inv 10589; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 10590; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 10591; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10592; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 10593; GFX11-NEXT: s_cbranch_execnz .LBB31_1 10594; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10595; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 10596; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 10597; GFX11-NEXT: s_setpc_b64 s[30:31] 10598; 10599; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10600; GFX10: ; %bb.0: 10601; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10602; GFX10-NEXT: v_mov_b32_e32 v1, v0 10603; GFX10-NEXT: v_mov_b32_e32 v0, s20 10604; GFX10-NEXT: s_add_i32 s4, s20, 0x400 10605; GFX10-NEXT: s_mov_b32 s5, 0 10606; GFX10-NEXT: v_mov_b32_e32 v4, s4 10607; GFX10-NEXT: v_lshlrev_b32_e32 v2, 
16, v1 10608; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 10609; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10610; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start 10611; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10612; GFX10-NEXT: s_waitcnt vmcnt(0) 10613; GFX10-NEXT: v_mov_b32_e32 v6, v0 10614; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10615; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 10616; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 10617; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 10618; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 10619; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 10620; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 10621; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 10622; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 10623; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 10624; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 10625; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 10626; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 10627; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo 10628; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 10629; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 10630; GFX10-NEXT: v_mov_b32_e32 v0, v5 10631; GFX10-NEXT: v_mov_b32_e32 v1, v6 10632; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 10633; GFX10-NEXT: s_waitcnt vmcnt(0) 10634; GFX10-NEXT: buffer_gl1_inv 10635; GFX10-NEXT: buffer_gl0_inv 10636; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 10637; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 10638; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 10639; GFX10-NEXT: s_cbranch_execnz .LBB31_1 10640; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10641; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 10642; GFX10-NEXT: s_setpc_b64 s[30:31] 10643; 10644; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10645; GFX90A: ; %bb.0: 10646; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10647; GFX90A-NEXT: v_mov_b32_e32 v1, v0 10648; GFX90A-NEXT: v_mov_b32_e32 v0, s20 10649; GFX90A-NEXT: buffer_load_dword v0, v0, 
s[16:19], 0 offen offset:1024 10650; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 10651; GFX90A-NEXT: s_mov_b64 s[6:7], 0 10652; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 10653; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 10654; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10655; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 10656; GFX90A-NEXT: v_mov_b32_e32 v4, s4 10657; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start 10658; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10659; GFX90A-NEXT: s_waitcnt vmcnt(0) 10660; GFX90A-NEXT: v_mov_b32_e32 v7, v0 10661; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 10662; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 10663; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 10664; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 10665; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 10666; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 10667; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 10668; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 10669; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 10670; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 10671; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 10672; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10673; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 10674; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 10675; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 10676; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] 10677; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 10678; GFX90A-NEXT: s_waitcnt vmcnt(0) 10679; GFX90A-NEXT: buffer_wbinvl1 10680; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 10681; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10682; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 10683; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 10684; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10685; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 10686; GFX90A-NEXT: s_setpc_b64 s[30:31] 10687; 10688; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10689; GFX908: ; %bb.0: 10690; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
10691; GFX908-NEXT: v_mov_b32_e32 v1, v0 10692; GFX908-NEXT: v_mov_b32_e32 v0, s20 10693; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 10694; GFX908-NEXT: s_add_i32 s4, s20, 0x400 10695; GFX908-NEXT: s_mov_b64 s[6:7], 0 10696; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 10697; GFX908-NEXT: s_movk_i32 s8, 0x7fff 10698; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10699; GFX908-NEXT: s_mov_b32 s9, 0x7060302 10700; GFX908-NEXT: v_mov_b32_e32 v4, s4 10701; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start 10702; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10703; GFX908-NEXT: s_waitcnt vmcnt(0) 10704; GFX908-NEXT: v_mov_b32_e32 v6, v0 10705; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 10706; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 10707; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 10708; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 10709; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 10710; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 10711; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 10712; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 10713; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 10714; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 10715; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 10716; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10717; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 10718; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 10719; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 10720; GFX908-NEXT: v_mov_b32_e32 v0, v5 10721; GFX908-NEXT: v_mov_b32_e32 v1, v6 10722; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 10723; GFX908-NEXT: s_waitcnt vmcnt(0) 10724; GFX908-NEXT: buffer_wbinvl1 10725; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 10726; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10727; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 10728; GFX908-NEXT: s_cbranch_execnz .LBB31_1 10729; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10730; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 10731; GFX908-NEXT: s_setpc_b64 s[30:31] 10732; 10733; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10734; GFX8: ; %bb.0: 10735; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10736; GFX8-NEXT: v_mov_b32_e32 v1, v0 10737; GFX8-NEXT: v_mov_b32_e32 v0, s20 10738; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 10739; GFX8-NEXT: s_add_i32 s4, s20, 0x400 10740; GFX8-NEXT: s_mov_b64 s[6:7], 0 10741; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 10742; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10743; GFX8-NEXT: v_mov_b32_e32 v4, s4 10744; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start 10745; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10746; GFX8-NEXT: s_waitcnt vmcnt(0) 10747; GFX8-NEXT: v_mov_b32_e32 v6, v0 10748; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 10749; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 10750; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 10751; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 10752; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 10753; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 10754; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 10755; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 10756; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 10757; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10758; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 10759; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 10760; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 10761; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 10762; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 10763; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 10764; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10765; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 10766; GFX8-NEXT: v_mov_b32_e32 v0, v5 10767; GFX8-NEXT: v_mov_b32_e32 v1, v6 10768; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 10769; GFX8-NEXT: s_waitcnt vmcnt(0) 10770; GFX8-NEXT: buffer_wbinvl1 10771; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 10772; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 10773; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 10774; GFX8-NEXT: s_cbranch_execnz .LBB31_1 10775; GFX8-NEXT: ; 
%bb.2: ; %atomicrmw.end 10776; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 10777; GFX8-NEXT: s_setpc_b64 s[30:31] 10778; 10779; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10780; GFX7: ; %bb.0: 10781; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10782; GFX7-NEXT: v_mov_b32_e32 v2, s20 10783; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 10784; GFX7-NEXT: s_add_i32 s6, s20, 0x400 10785; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 10786; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 10787; GFX7-NEXT: s_mov_b64 s[4:5], 0 10788; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 10789; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10790; GFX7-NEXT: s_waitcnt vmcnt(0) 10791; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 10792; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 10793; GFX7-NEXT: v_mov_b32_e32 v4, s6 10794; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start 10795; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10796; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 10797; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 10798; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 10799; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 10800; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10801; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 10802; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 10803; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 10804; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 10805; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 10806; GFX7-NEXT: v_mov_b32_e32 v6, v1 10807; GFX7-NEXT: v_mov_b32_e32 v5, v0 10808; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10809; GFX7-NEXT: s_waitcnt vmcnt(0) 10810; GFX7-NEXT: buffer_wbinvl1 10811; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 10812; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 10813; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10814; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 10815; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10816; GFX7-NEXT: s_cbranch_execnz .LBB31_1 10817; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10818; GFX7-NEXT: s_or_b64 
exec, exec, s[4:5] 10819; GFX7-NEXT: s_setpc_b64 s[30:31] 10820; 10821; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: 10822; GFX6: ; %bb.0: 10823; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10824; GFX6-NEXT: v_mov_b32_e32 v2, s20 10825; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 10826; GFX6-NEXT: s_add_i32 s6, s20, 0x400 10827; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 10828; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 10829; GFX6-NEXT: s_mov_b64 s[4:5], 0 10830; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 10831; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 10832; GFX6-NEXT: s_waitcnt vmcnt(0) 10833; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 10834; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 10835; GFX6-NEXT: v_mov_b32_e32 v4, s6 10836; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start 10837; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 10838; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 10839; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 10840; GFX6-NEXT: s_waitcnt expcnt(0) 10841; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 10842; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 10843; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10844; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 10845; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 10846; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 10847; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 10848; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 10849; GFX6-NEXT: v_mov_b32_e32 v6, v1 10850; GFX6-NEXT: v_mov_b32_e32 v5, v0 10851; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 10852; GFX6-NEXT: s_waitcnt vmcnt(0) 10853; GFX6-NEXT: buffer_wbinvl1 10854; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 10855; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 10856; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10857; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 10858; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 10859; GFX6-NEXT: s_cbranch_execnz .LBB31_1 10860; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 10861; GFX6-NEXT: s_or_b64 exec, exec, 
s[4:5] 10862; GFX6-NEXT: s_waitcnt expcnt(0) 10863; GFX6-NEXT: s_setpc_b64 s[30:31] 10864 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 10865 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 10866 ret <2 x bfloat> %result 10867} 10868

; No-return variant: agent-scope, seq_cst `atomicrmw fadd` of a <2 x bfloat>
; through a buffer fat pointer (addrspace(7)); the previous memory value is
; discarded. Only !amdgpu.no.remote.memory is attached to the atomic (there is
; no !amdgpu.no.fine.grained.memory on this one).
;
; According to the checks below, the gfx1200 run selects a single
; buffer_atomic_pk_add_bf16, while every other run expands the operation into
; an initial buffer load followed by a buffer_atomic_cmpswap retry loop
; (.LBB32_1).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them with that script instead of
; editing them by hand.
define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB32_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB32_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB32_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
  ; Element index 256 of <2 x bfloat>; the checks above address this location
  ; as offset:1024 on the initial access and as base + 0x400 in the expanded loops.
  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
  ; %unused is never read: the function returns void, which is what makes this
  ; the "noret" flavour of the test.
  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
 11242 11243define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { 11244; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11245; GFX12: ; %bb.0: 11246; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11247; GFX12-NEXT: s_wait_expcnt 0x0 11248; GFX12-NEXT: s_wait_samplecnt 0x0 11249; GFX12-NEXT: s_wait_bvhcnt 0x0 11250; GFX12-NEXT: s_wait_kmcnt 0x0 11251; GFX12-NEXT: v_mov_b32_e32 v1, s16 11252; GFX12-NEXT: s_wait_storecnt 0x0 11253; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 11254; GFX12-NEXT: s_wait_storecnt 0x0 11255; GFX12-NEXT: global_inv scope:SCOPE_DEV 11256; GFX12-NEXT: s_setpc_b64 s[30:31] 11257; 11258; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11259; GFX940: ; %bb.0: 11260; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11261; GFX940-NEXT: v_mov_b32_e32 v1, s16 11262; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 11263; GFX940-NEXT: s_add_i32 s4, s16, 0x400 11264; GFX940-NEXT: s_mov_b64 s[6:7], 0 11265; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 11266; GFX940-NEXT: s_movk_i32 s8, 0x7fff 11267; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 11268; GFX940-NEXT: s_mov_b32 s9, 0x7060302 11269; GFX940-NEXT: v_mov_b32_e32 v4, s4 11270; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start 11271; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11272; GFX940-NEXT: s_waitcnt
vmcnt(0) 11273; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 11274; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 11275; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 11276; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 11277; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 11278; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 11279; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 11280; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 11281; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 11282; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 11283; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11284; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 11285; GFX940-NEXT: buffer_wbl2 sc1 11286; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11287; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 11288; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 11289; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] 11290; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 11291; GFX940-NEXT: s_waitcnt vmcnt(0) 11292; GFX940-NEXT: buffer_inv sc1 11293; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 11294; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 11295; GFX940-NEXT: v_mov_b32_e32 v1, v6 11296; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 11297; GFX940-NEXT: s_cbranch_execnz .LBB33_1 11298; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11299; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 11300; GFX940-NEXT: s_setpc_b64 s[30:31] 11301; 11302; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11303; GFX11: ; %bb.0: 11304; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11305; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 11306; GFX11-NEXT: s_add_i32 s4, s16, 0x400 11307; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11308; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 11309; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 11310; GFX11-NEXT: s_mov_b32 s5, 0 11311; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 11312; GFX11-NEXT: .p2align 6 
11313; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start 11314; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11315; GFX11-NEXT: s_waitcnt vmcnt(0) 11316; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 11317; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 11318; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11319; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11320; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 11321; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 11322; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 11323; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 11324; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 11325; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 11326; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11327; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11328; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 11329; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 11330; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 11331; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 11332; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 11333; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11334; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 11335; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 11336; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc 11337; GFX11-NEXT: s_waitcnt vmcnt(0) 11338; GFX11-NEXT: buffer_gl1_inv 11339; GFX11-NEXT: buffer_gl0_inv 11340; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 11341; GFX11-NEXT: v_mov_b32_e32 v1, v5 11342; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 11343; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11344; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 11345; GFX11-NEXT: s_cbranch_execnz .LBB33_1 11346; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11347; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 11348; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 11349; GFX11-NEXT: s_setpc_b64 s[30:31] 11350; 11351; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11352; GFX10: ; %bb.0: 11353; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11354; GFX10-NEXT: v_mov_b32_e32 v1, s20 11355; GFX10-NEXT: s_add_i32 s4, s20, 0x400 11356; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 11357; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 11358; GFX10-NEXT: v_mov_b32_e32 v4, s4 11359; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 11360; GFX10-NEXT: s_mov_b32 s5, 0 11361; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start 11362; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11363; GFX10-NEXT: s_waitcnt vmcnt(0) 11364; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 11365; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 11366; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11367; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 11368; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 11369; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 11370; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 11371; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 11372; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 11373; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11374; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 11375; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11376; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 11377; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 11378; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 11379; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 11380; GFX10-NEXT: v_mov_b32_e32 v6, v1 11381; GFX10-NEXT: v_mov_b32_e32 v5, v0 11382; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 11383; GFX10-NEXT: s_waitcnt vmcnt(0) 11384; GFX10-NEXT: buffer_gl1_inv 11385; GFX10-NEXT: buffer_gl0_inv 11386; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 11387; GFX10-NEXT: v_mov_b32_e32 v1, v5 11388; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 11389; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 11390; GFX10-NEXT: s_cbranch_execnz .LBB33_1 11391; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11392; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s5 11393; GFX10-NEXT: s_setpc_b64 s[30:31] 11394; 11395; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11396; GFX90A: ; %bb.0: 11397; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11398; GFX90A-NEXT: v_mov_b32_e32 v1, s20 11399; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 11400; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 11401; GFX90A-NEXT: s_mov_b64 s[6:7], 0 11402; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 11403; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 11404; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 11405; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 11406; GFX90A-NEXT: v_mov_b32_e32 v4, s4 11407; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start 11408; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11409; GFX90A-NEXT: s_waitcnt vmcnt(0) 11410; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 11411; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 11412; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 11413; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 11414; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 11415; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 11416; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 11417; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 11418; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 11419; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 11420; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11421; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 11422; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 11423; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11424; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 11425; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] 11426; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc 11427; GFX90A-NEXT: s_waitcnt vmcnt(0) 11428; GFX90A-NEXT: buffer_wbinvl1 11429; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 11430; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 11431; GFX90A-NEXT: v_mov_b32_e32 v1, v6 11432; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[6:7] 11433; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 11434; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11435; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 11436; GFX90A-NEXT: s_setpc_b64 s[30:31] 11437; 11438; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11439; GFX908: ; %bb.0: 11440; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11441; GFX908-NEXT: v_mov_b32_e32 v1, s20 11442; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 11443; GFX908-NEXT: s_add_i32 s4, s20, 0x400 11444; GFX908-NEXT: s_mov_b64 s[6:7], 0 11445; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 11446; GFX908-NEXT: s_movk_i32 s8, 0x7fff 11447; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 11448; GFX908-NEXT: s_mov_b32 s9, 0x7060302 11449; GFX908-NEXT: v_mov_b32_e32 v4, s4 11450; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start 11451; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11452; GFX908-NEXT: s_waitcnt vmcnt(0) 11453; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 11454; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 11455; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 11456; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 11457; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 11458; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 11459; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 11460; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 11461; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 11462; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 11463; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11464; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 11465; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 11466; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11467; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 11468; GFX908-NEXT: v_mov_b32_e32 v6, v1 11469; GFX908-NEXT: v_mov_b32_e32 v5, v0 11470; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 11471; GFX908-NEXT: s_waitcnt vmcnt(0) 11472; GFX908-NEXT: buffer_wbinvl1 11473; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 
11474; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 11475; GFX908-NEXT: v_mov_b32_e32 v1, v5 11476; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 11477; GFX908-NEXT: s_cbranch_execnz .LBB33_1 11478; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11479; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 11480; GFX908-NEXT: s_setpc_b64 s[30:31] 11481; 11482; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11483; GFX8: ; %bb.0: 11484; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11485; GFX8-NEXT: v_mov_b32_e32 v1, s20 11486; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 11487; GFX8-NEXT: s_add_i32 s4, s20, 0x400 11488; GFX8-NEXT: s_mov_b64 s[6:7], 0 11489; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 11490; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 11491; GFX8-NEXT: v_mov_b32_e32 v4, s4 11492; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start 11493; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11494; GFX8-NEXT: s_waitcnt vmcnt(0) 11495; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 11496; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 11497; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 11498; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 11499; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 11500; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 11501; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 11502; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 11503; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 11504; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11505; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 11506; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11507; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 11508; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 11509; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11510; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 11511; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11512; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 11513; GFX8-NEXT: v_mov_b32_e32 v6, v1 11514; GFX8-NEXT: v_mov_b32_e32 v5, v0 11515; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, 
s[16:19], 0 offen glc 11516; GFX8-NEXT: s_waitcnt vmcnt(0) 11517; GFX8-NEXT: buffer_wbinvl1 11518; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 11519; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 11520; GFX8-NEXT: v_mov_b32_e32 v1, v5 11521; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 11522; GFX8-NEXT: s_cbranch_execnz .LBB33_1 11523; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11524; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 11525; GFX8-NEXT: s_setpc_b64 s[30:31] 11526; 11527; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11528; GFX7: ; %bb.0: 11529; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11530; GFX7-NEXT: v_mov_b32_e32 v2, s20 11531; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 11532; GFX7-NEXT: s_add_i32 s6, s20, 0x400 11533; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 11534; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 11535; GFX7-NEXT: s_mov_b64 s[4:5], 0 11536; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11537; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11538; GFX7-NEXT: s_waitcnt vmcnt(0) 11539; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 11540; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11541; GFX7-NEXT: v_mov_b32_e32 v2, s6 11542; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start 11543; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11544; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 11545; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 11546; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 11547; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 11548; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11549; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 11550; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 11551; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 11552; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 11553; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 11554; GFX7-NEXT: v_mov_b32_e32 v6, v4 11555; GFX7-NEXT: v_mov_b32_e32 v5, v3 11556; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 11557; GFX7-NEXT: s_waitcnt vmcnt(0) 11558; GFX7-NEXT: 
buffer_wbinvl1 11559; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 11560; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 11561; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11562; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 11563; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11564; GFX7-NEXT: s_cbranch_execnz .LBB33_1 11565; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11566; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11567; GFX7-NEXT: s_setpc_b64 s[30:31] 11568; 11569; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 11570; GFX6: ; %bb.0: 11571; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11572; GFX6-NEXT: v_mov_b32_e32 v2, s20 11573; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 11574; GFX6-NEXT: s_add_i32 s6, s20, 0x400 11575; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 11576; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 11577; GFX6-NEXT: s_mov_b64 s[4:5], 0 11578; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11579; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11580; GFX6-NEXT: s_waitcnt vmcnt(0) 11581; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 11582; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11583; GFX6-NEXT: v_mov_b32_e32 v2, s6 11584; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start 11585; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 11586; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 11587; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 11588; GFX6-NEXT: s_waitcnt expcnt(0) 11589; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 11590; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 11591; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11592; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 11593; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 11594; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 11595; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 11596; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 11597; GFX6-NEXT: v_mov_b32_e32 v6, v4 11598; GFX6-NEXT: v_mov_b32_e32 v5, v3 11599; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 11600; GFX6-NEXT: s_waitcnt vmcnt(0) 
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
  ret void
}

; --------------------------------------------------------------------
; misc
; --------------------------------------------------------------------

; seq_cst `atomicrmw fadd` of a float through a buffer fat pointer with no
; syncscope given (the system-scope variant, as the function name says); the
; value returned by the atomicrmw is the function result. The address is %ptr
; advanced by 256 floats, which appears as the 1024-byte (0x400) offset in the
; expected output below. Only !amdgpu.no.fine.grained.memory is attached; the
; preceding test additionally carries !amdgpu.no.remote.memory.
;
; Per the autogenerated expectations, the gfx1200, gfx940 and gfx1100 runs
; select a returning buffer_atomic_add_f32, while the gfx1010, gfx90a, gfx908,
; tonga, hawaii and tahiti runs expand to a buffer_atomic_cmpswap retry loop.
;
; The expectation lines inside the function are generated: regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB34_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB34_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB34_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v0
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s20
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v1, v5
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB34_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

attributes #0 = { nounwind }

!0 = !{}