1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s 3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s 4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s 7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s 8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s 9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s 10 11; -------------------------------------------------------------------- 12; float 13; -------------------------------------------------------------------- 14 15define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { 16; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32: 17; GFX12: ; %bb.0: 18; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 19; GFX12-NEXT: s_wait_expcnt 0x0 20; GFX12-NEXT: s_wait_samplecnt 0x0 21; GFX12-NEXT: s_wait_bvhcnt 0x0 22; GFX12-NEXT: s_wait_kmcnt 0x0 23; GFX12-NEXT: flat_load_b32 v3, v[0:1] 24; GFX12-NEXT: s_mov_b32 s0, 0 25; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start 26; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 27; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 28; GFX12-NEXT: v_mov_b32_e32 v4, v3 29; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 30; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 31; GFX12-NEXT: s_wait_storecnt 0x0 32; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 33; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 34; GFX12-NEXT: global_inv scope:SCOPE_DEV 35; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 36; GFX12-NEXT: s_wait_alu 0xfffe 37; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
38; GFX12-NEXT: s_wait_alu 0xfffe 39; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 40; GFX12-NEXT: s_cbranch_execnz .LBB0_1 41; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 42; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 43; GFX12-NEXT: v_mov_b32_e32 v0, v3 44; GFX12-NEXT: s_wait_alu 0xfffe 45; GFX12-NEXT: s_setpc_b64 s[30:31] 46; 47; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32: 48; GFX940: ; %bb.0: 49; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 50; GFX940-NEXT: flat_load_dword v3, v[0:1] 51; GFX940-NEXT: s_mov_b64 s[0:1], 0 52; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start 53; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 54; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 55; GFX940-NEXT: v_mov_b32_e32 v5, v3 56; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 57; GFX940-NEXT: buffer_wbl2 sc1 58; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 59; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 60; GFX940-NEXT: buffer_inv sc1 61; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 62; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 63; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 64; GFX940-NEXT: s_cbranch_execnz .LBB0_1 65; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 66; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 67; GFX940-NEXT: v_mov_b32_e32 v0, v3 68; GFX940-NEXT: s_setpc_b64 s[30:31] 69; 70; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32: 71; GFX11: ; %bb.0: 72; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 73; GFX11-NEXT: flat_load_b32 v3, v[0:1] 74; GFX11-NEXT: s_mov_b32 s0, 0 75; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start 76; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 77; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 78; GFX11-NEXT: v_mov_b32_e32 v4, v3 79; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 80; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 81; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 82; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 83; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 84; GFX11-NEXT: buffer_gl1_inv 85; GFX11-NEXT: buffer_gl0_inv 86; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 87; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 88; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 89; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 90; GFX11-NEXT: s_cbranch_execnz .LBB0_1 91; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 92; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 93; GFX11-NEXT: v_mov_b32_e32 v0, v3 94; GFX11-NEXT: s_setpc_b64 s[30:31] 95; 96; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32: 97; GFX10: ; %bb.0: 98; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 99; GFX10-NEXT: flat_load_dword v3, v[0:1] 100; GFX10-NEXT: s_mov_b32 s4, 0 101; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start 102; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 103; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 104; GFX10-NEXT: v_mov_b32_e32 v4, v3 105; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 106; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 107; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 108; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 109; GFX10-NEXT: buffer_gl1_inv 110; GFX10-NEXT: buffer_gl0_inv 111; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 112; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 113; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 114; GFX10-NEXT: s_cbranch_execnz .LBB0_1 115; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 116; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 117; GFX10-NEXT: v_mov_b32_e32 v0, v3 118; GFX10-NEXT: s_setpc_b64 s[30:31] 119; 120; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32: 121; GFX90A: ; %bb.0: 122; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 123; GFX90A-NEXT: flat_load_dword v3, v[0:1] 124; GFX90A-NEXT: s_mov_b64 s[4:5], 0 125; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start 126; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 127; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 128; GFX90A-NEXT: v_mov_b32_e32 v5, v3 129; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 130; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 131; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 132; GFX90A-NEXT: buffer_wbinvl1 133; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 134; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 135; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 136; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 137; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 138; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 139; GFX90A-NEXT: v_mov_b32_e32 v0, v3 140; GFX90A-NEXT: s_setpc_b64 s[30:31] 141; 142; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32: 143; GFX908: ; %bb.0: 144; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 145; GFX908-NEXT: flat_load_dword v3, v[0:1] 146; GFX908-NEXT: s_mov_b64 s[4:5], 0 147; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start 148; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 149; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 150; GFX908-NEXT: v_mov_b32_e32 v4, v3 151; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 152; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 153; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 154; GFX908-NEXT: buffer_wbinvl1 155; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 156; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 157; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 158; GFX908-NEXT: s_cbranch_execnz .LBB0_1 159; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 160; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 161; GFX908-NEXT: v_mov_b32_e32 v0, v3 162; GFX908-NEXT: s_setpc_b64 s[30:31] 163; 164; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32: 165; GFX8: ; %bb.0: 166; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 167; GFX8-NEXT: flat_load_dword v3, v[0:1] 168; GFX8-NEXT: s_mov_b64 s[4:5], 0 169; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start 170; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 171; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 172; GFX8-NEXT: v_mov_b32_e32 v4, v3 173; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 174; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 175; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 176; GFX8-NEXT: buffer_wbinvl1 177; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 178; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 179; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 180; GFX8-NEXT: s_cbranch_execnz .LBB0_1 181; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 182; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 183; GFX8-NEXT: v_mov_b32_e32 v0, v3 184; GFX8-NEXT: s_setpc_b64 s[30:31] 185; 186; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32: 187; GFX7: ; %bb.0: 188; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 189; GFX7-NEXT: flat_load_dword v3, v[0:1] 190; GFX7-NEXT: s_mov_b64 s[4:5], 0 191; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start 192; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 193; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 194; GFX7-NEXT: v_mov_b32_e32 v4, v3 195; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 196; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 197; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 198; GFX7-NEXT: buffer_wbinvl1 199; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 200; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 201; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 202; GFX7-NEXT: s_cbranch_execnz .LBB0_1 203; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 204; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 205; GFX7-NEXT: v_mov_b32_e32 v0, v3 206; GFX7-NEXT: s_setpc_b64 s[30:31] 207 %result = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst 208 ret float %result 209} 210 211define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val) #0 { 212; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 213; GFX12: ; %bb.0: 214; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 215; GFX12-NEXT: s_wait_expcnt 0x0 216; GFX12-NEXT: s_wait_samplecnt 0x0 217; GFX12-NEXT: s_wait_bvhcnt 0x0 218; GFX12-NEXT: s_wait_kmcnt 0x0 219; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 220; GFX12-NEXT: s_mov_b32 s0, 0 221; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start 222; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 223; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 224; GFX12-NEXT: v_mov_b32_e32 v4, v3 225; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 226; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 227; GFX12-NEXT: s_wait_storecnt 0x0 228; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV 229; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 230; GFX12-NEXT: global_inv scope:SCOPE_DEV 231; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 232; GFX12-NEXT: s_wait_alu 0xfffe 233; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 234; GFX12-NEXT: s_wait_alu 0xfffe 235; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 236; GFX12-NEXT: s_cbranch_execnz .LBB1_1 237; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 238; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 239; GFX12-NEXT: v_mov_b32_e32 v0, v3 240; GFX12-NEXT: s_wait_alu 0xfffe 241; GFX12-NEXT: s_setpc_b64 s[30:31] 242; 243; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 244; GFX940: ; %bb.0: 245; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 246; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 247; GFX940-NEXT: s_mov_b64 s[0:1], 0 248; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start 249; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 250; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 251; GFX940-NEXT: v_mov_b32_e32 v5, v3 252; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 253; GFX940-NEXT: buffer_wbl2 sc1 254; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 255; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 256; GFX940-NEXT: buffer_inv sc1 257; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 258; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 259; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 260; GFX940-NEXT: s_cbranch_execnz .LBB1_1 261; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 262; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 263; GFX940-NEXT: v_mov_b32_e32 v0, v3 264; GFX940-NEXT: s_setpc_b64 s[30:31] 265; 266; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 267; GFX11: ; %bb.0: 268; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 269; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 270; GFX11-NEXT: s_mov_b32 s0, 0 271; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start 272; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 273; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 274; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 275; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 276; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 277; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 278; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 279; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 280; GFX11-NEXT: buffer_gl1_inv 281; GFX11-NEXT: buffer_gl0_inv 282; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 283; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 284; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 285; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 286; GFX11-NEXT: s_cbranch_execnz .LBB1_1 287; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 288; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 289; GFX11-NEXT: v_mov_b32_e32 v0, v3 290; GFX11-NEXT: s_setpc_b64 s[30:31] 291; 292; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 293; GFX10: ; %bb.0: 294; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 295; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 296; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 297; GFX10-NEXT: s_mov_b32 s4, 0 298; GFX10-NEXT: flat_load_dword v0, v[3:4] 299; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start 300; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 301; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 302; GFX10-NEXT: v_mov_b32_e32 v1, v0 303; GFX10-NEXT: v_sub_f32_e32 v0, v1, v2 304; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 305; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 306; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 307; GFX10-NEXT: buffer_gl1_inv 308; GFX10-NEXT: buffer_gl0_inv 309; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 310; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 311; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 312; GFX10-NEXT: s_cbranch_execnz .LBB1_1 313; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 314; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 315; GFX10-NEXT: s_setpc_b64 s[30:31] 316; 317; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 318; GFX90A: ; %bb.0: 319; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 320; 
GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 321; GFX90A-NEXT: s_mov_b64 s[4:5], 0 322; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start 323; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 324; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 325; GFX90A-NEXT: v_mov_b32_e32 v5, v3 326; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 327; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 328; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 329; GFX90A-NEXT: buffer_wbinvl1 330; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 331; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 332; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 333; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 334; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 335; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 336; GFX90A-NEXT: v_mov_b32_e32 v0, v3 337; GFX90A-NEXT: s_setpc_b64 s[30:31] 338; 339; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 340; GFX908: ; %bb.0: 341; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 342; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 343; GFX908-NEXT: s_mov_b64 s[4:5], 0 344; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start 345; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 346; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 347; GFX908-NEXT: v_mov_b32_e32 v4, v3 348; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 349; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 350; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 351; GFX908-NEXT: buffer_wbinvl1 352; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 353; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 354; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 355; GFX908-NEXT: s_cbranch_execnz .LBB1_1 356; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 357; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 358; GFX908-NEXT: v_mov_b32_e32 v0, v3 359; GFX908-NEXT: s_setpc_b64 s[30:31] 360; 361; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 362; GFX8: ; %bb.0: 363; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 364; GFX8-NEXT: v_add_u32_e32 v3, vcc, 
0x7fc, v0 365; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 366; GFX8-NEXT: flat_load_dword v0, v[3:4] 367; GFX8-NEXT: s_mov_b64 s[4:5], 0 368; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start 369; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 370; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 371; GFX8-NEXT: v_mov_b32_e32 v1, v0 372; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 373; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 374; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 375; GFX8-NEXT: buffer_wbinvl1 376; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 377; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 378; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 379; GFX8-NEXT: s_cbranch_execnz .LBB1_1 380; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 381; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 382; GFX8-NEXT: s_setpc_b64 s[30:31] 383; 384; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: 385; GFX7: ; %bb.0: 386; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 387; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 388; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 389; GFX7-NEXT: flat_load_dword v0, v[3:4] 390; GFX7-NEXT: s_mov_b64 s[4:5], 0 391; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start 392; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 393; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 394; GFX7-NEXT: v_mov_b32_e32 v1, v0 395; GFX7-NEXT: v_sub_f32_e32 v0, v1, v2 396; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 397; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 398; GFX7-NEXT: buffer_wbinvl1 399; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 400; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 401; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 402; GFX7-NEXT: s_cbranch_execnz .LBB1_1 403; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 404; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 405; GFX7-NEXT: s_setpc_b64 s[30:31] 406 %gep = getelementptr float, ptr %ptr, i64 511 407 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst 408 ret float %result 409} 410 411define float 
@flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val) #0 { 412; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 413; GFX12: ; %bb.0: 414; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 415; GFX12-NEXT: s_wait_expcnt 0x0 416; GFX12-NEXT: s_wait_samplecnt 0x0 417; GFX12-NEXT: s_wait_bvhcnt 0x0 418; GFX12-NEXT: s_wait_kmcnt 0x0 419; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 420; GFX12-NEXT: s_mov_b32 s0, 0 421; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start 422; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 423; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 424; GFX12-NEXT: v_mov_b32_e32 v4, v3 425; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 426; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 427; GFX12-NEXT: s_wait_storecnt 0x0 428; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 429; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 430; GFX12-NEXT: global_inv scope:SCOPE_DEV 431; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 432; GFX12-NEXT: s_wait_alu 0xfffe 433; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 434; GFX12-NEXT: s_wait_alu 0xfffe 435; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 436; GFX12-NEXT: s_cbranch_execnz .LBB2_1 437; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 438; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 439; GFX12-NEXT: v_mov_b32_e32 v0, v3 440; GFX12-NEXT: s_wait_alu 0xfffe 441; GFX12-NEXT: s_setpc_b64 s[30:31] 442; 443; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 444; GFX940: ; %bb.0: 445; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 446; GFX940-NEXT: v_mov_b32_e32 v4, v0 447; GFX940-NEXT: v_mov_b32_e32 v5, v1 448; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 449; GFX940-NEXT: s_movk_i32 s0, 0xf800 450; GFX940-NEXT: s_nop 0 451; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 452; GFX940-NEXT: flat_load_dword v0, v[0:1] 453; GFX940-NEXT: s_mov_b32 s1, -1 454; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 455; GFX940-NEXT: s_mov_b64 s[0:1], 0 456; GFX940-NEXT: 
.LBB2_1: ; %atomicrmw.start 457; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 458; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 459; GFX940-NEXT: v_mov_b32_e32 v1, v0 460; GFX940-NEXT: v_sub_f32_e32 v0, v1, v2 461; GFX940-NEXT: buffer_wbl2 sc1 462; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 463; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 464; GFX940-NEXT: buffer_inv sc1 465; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 466; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 467; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 468; GFX940-NEXT: s_cbranch_execnz .LBB2_1 469; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 470; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 471; GFX940-NEXT: s_setpc_b64 s[30:31] 472; 473; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 474; GFX11: ; %bb.0: 475; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 476; GFX11-NEXT: v_mov_b32_e32 v3, v0 477; GFX11-NEXT: s_mov_b32 s0, 0 478; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 479; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 480; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 481; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 482; GFX11-NEXT: flat_load_b32 v0, v[4:5] 483; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 484; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start 485; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 486; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 487; GFX11-NEXT: v_mov_b32_e32 v1, v0 488; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 489; GFX11-NEXT: v_sub_f32_e32 v0, v1, v2 490; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 491; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc 492; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 493; GFX11-NEXT: buffer_gl1_inv 494; GFX11-NEXT: buffer_gl0_inv 495; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 496; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 497; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 498; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 499; GFX11-NEXT: s_cbranch_execnz .LBB2_1 500; GFX11-NEXT: 
; %bb.2: ; %atomicrmw.end 501; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 502; GFX11-NEXT: s_setpc_b64 s[30:31] 503; 504; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 505; GFX10: ; %bb.0: 506; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 507; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 508; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 509; GFX10-NEXT: s_mov_b32 s4, 0 510; GFX10-NEXT: flat_load_dword v0, v[3:4] 511; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start 512; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 513; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 514; GFX10-NEXT: v_mov_b32_e32 v1, v0 515; GFX10-NEXT: v_sub_f32_e32 v0, v1, v2 516; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 517; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 518; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 519; GFX10-NEXT: buffer_gl1_inv 520; GFX10-NEXT: buffer_gl0_inv 521; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 522; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 523; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 524; GFX10-NEXT: s_cbranch_execnz .LBB2_1 525; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 526; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 527; GFX10-NEXT: s_setpc_b64 s[30:31] 528; 529; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 530; GFX90A: ; %bb.0: 531; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 532; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 533; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 534; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 535; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 536; GFX90A-NEXT: flat_load_dword v0, v[0:1] 537; GFX90A-NEXT: s_mov_b64 s[4:5], 0 538; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start 539; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 540; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 541; GFX90A-NEXT: v_mov_b32_e32 v1, v0 542; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 543; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 544; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 545; GFX90A-NEXT: buffer_wbinvl1 546; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 547; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 548; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 549; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 550; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 551; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 552; GFX90A-NEXT: s_setpc_b64 s[30:31] 553; 554; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 555; GFX908: ; %bb.0: 556; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 557; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 558; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 559; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 560; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 561; GFX908-NEXT: flat_load_dword v0, v[0:1] 562; GFX908-NEXT: s_mov_b64 s[4:5], 0 563; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start 564; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 565; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 566; GFX908-NEXT: v_mov_b32_e32 v1, v0 567; GFX908-NEXT: v_sub_f32_e32 v0, v1, v2 568; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 569; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 570; GFX908-NEXT: buffer_wbinvl1 571; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 572; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 573; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 574; GFX908-NEXT: s_cbranch_execnz .LBB2_1 575; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 576; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 577; GFX908-NEXT: s_setpc_b64 s[30:31] 578; 579; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 580; GFX8: ; %bb.0: 581; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 582; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 583; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 584; GFX8-NEXT: flat_load_dword v0, v[3:4] 585; GFX8-NEXT: s_mov_b64 s[4:5], 0 586; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start 587; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 588; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 589; GFX8-NEXT: 
v_mov_b32_e32 v1, v0 590; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 591; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 592; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 593; GFX8-NEXT: buffer_wbinvl1 594; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 595; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 596; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 597; GFX8-NEXT: s_cbranch_execnz .LBB2_1 598; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 599; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 600; GFX8-NEXT: s_setpc_b64 s[30:31] 601; 602; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: 603; GFX7: ; %bb.0: 604; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 605; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 606; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 607; GFX7-NEXT: flat_load_dword v0, v[3:4] 608; GFX7-NEXT: s_mov_b64 s[4:5], 0 609; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start 610; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 611; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 612; GFX7-NEXT: v_mov_b32_e32 v1, v0 613; GFX7-NEXT: v_sub_f32_e32 v0, v1, v2 614; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 615; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 616; GFX7-NEXT: buffer_wbinvl1 617; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 618; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 619; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 620; GFX7-NEXT: s_cbranch_execnz .LBB2_1 621; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 622; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 623; GFX7-NEXT: s_setpc_b64 s[30:31] 624 %gep = getelementptr float, ptr %ptr, i64 -512 625 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst 626 ret float %result 627} 628 629define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { 630; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32: 631; GFX12: ; %bb.0: 632; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 633; GFX12-NEXT: s_wait_expcnt 0x0 634; GFX12-NEXT: s_wait_samplecnt 0x0 635; GFX12-NEXT: s_wait_bvhcnt 0x0 636; GFX12-NEXT: s_wait_kmcnt 0x0 637; GFX12-NEXT: 
flat_load_b32 v4, v[0:1] 638; GFX12-NEXT: s_mov_b32 s0, 0 639; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start 640; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 641; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 642; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 643; GFX12-NEXT: s_wait_storecnt 0x0 644; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 645; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 646; GFX12-NEXT: global_inv scope:SCOPE_DEV 647; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 648; GFX12-NEXT: v_mov_b32_e32 v4, v3 649; GFX12-NEXT: s_wait_alu 0xfffe 650; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 651; GFX12-NEXT: s_wait_alu 0xfffe 652; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 653; GFX12-NEXT: s_cbranch_execnz .LBB3_1 654; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 655; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 656; GFX12-NEXT: s_wait_alu 0xfffe 657; GFX12-NEXT: s_setpc_b64 s[30:31] 658; 659; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: 660; GFX940: ; %bb.0: 661; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 662; GFX940-NEXT: flat_load_dword v5, v[0:1] 663; GFX940-NEXT: s_mov_b64 s[0:1], 0 664; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start 665; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 666; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 667; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 668; GFX940-NEXT: buffer_wbl2 sc1 669; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 670; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 671; GFX940-NEXT: buffer_inv sc1 672; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 673; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 674; GFX940-NEXT: v_mov_b32_e32 v5, v3 675; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 676; GFX940-NEXT: s_cbranch_execnz .LBB3_1 677; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 678; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 679; GFX940-NEXT: s_setpc_b64 s[30:31] 680; 681; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32: 682; GFX11: ; %bb.0: 683; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 684; 
GFX11-NEXT: flat_load_b32 v4, v[0:1] 685; GFX11-NEXT: s_mov_b32 s0, 0 686; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start 687; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 688; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 689; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 690; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 691; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 692; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 693; GFX11-NEXT: buffer_gl1_inv 694; GFX11-NEXT: buffer_gl0_inv 695; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 696; GFX11-NEXT: v_mov_b32_e32 v4, v3 697; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 698; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 699; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 700; GFX11-NEXT: s_cbranch_execnz .LBB3_1 701; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 702; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 703; GFX11-NEXT: s_setpc_b64 s[30:31] 704; 705; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32: 706; GFX10: ; %bb.0: 707; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 708; GFX10-NEXT: flat_load_dword v4, v[0:1] 709; GFX10-NEXT: s_mov_b32 s4, 0 710; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start 711; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 712; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 713; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 714; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 715; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 716; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 717; GFX10-NEXT: buffer_gl1_inv 718; GFX10-NEXT: buffer_gl0_inv 719; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 720; GFX10-NEXT: v_mov_b32_e32 v4, v3 721; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 722; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 723; GFX10-NEXT: s_cbranch_execnz .LBB3_1 724; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 725; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 726; GFX10-NEXT: s_setpc_b64 s[30:31] 727; 728; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32: 729; GFX90A: ; %bb.0: 730; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 731; GFX90A-NEXT: 
flat_load_dword v5, v[0:1] 732; GFX90A-NEXT: s_mov_b64 s[4:5], 0 733; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start 734; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 735; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 736; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 737; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 738; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 739; GFX90A-NEXT: buffer_wbinvl1 740; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 741; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 742; GFX90A-NEXT: v_mov_b32_e32 v5, v3 743; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 744; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 745; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 746; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 747; GFX90A-NEXT: s_setpc_b64 s[30:31] 748; 749; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32: 750; GFX908: ; %bb.0: 751; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 752; GFX908-NEXT: flat_load_dword v4, v[0:1] 753; GFX908-NEXT: s_mov_b64 s[4:5], 0 754; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start 755; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 756; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 757; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 758; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 759; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 760; GFX908-NEXT: buffer_wbinvl1 761; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 762; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 763; GFX908-NEXT: v_mov_b32_e32 v4, v3 764; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 765; GFX908-NEXT: s_cbranch_execnz .LBB3_1 766; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 767; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 768; GFX908-NEXT: s_setpc_b64 s[30:31] 769; 770; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32: 771; GFX8: ; %bb.0: 772; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 773; GFX8-NEXT: flat_load_dword v4, v[0:1] 774; GFX8-NEXT: s_mov_b64 s[4:5], 0 775; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start 776; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 777; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v4, v3
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_load_dword v4, v[0:1]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v4, v3
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %unused = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst
  ret void
}

; Agent-scope seq_cst atomicrmw fsub on a flat pointer displaced by 511 floats
; (2044 bytes); the old value is unused. The checks for every target show an
; initial load followed by a v_sub_f32 + flat_atomic_cmpswap retry loop that
; exits once the returned value equals the expected one. The GFX12, GFX940,
; GFX11, GFX90A and GFX908 checks carry the displacement as an instruction
; offset of 2044, while the GFX10, GFX8 and GFX7 checks first add 0x7fc to the
; pointer with an explicit 64-bit add.
define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT:    v_mov_b32_e32 v4, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
; GFX940-NEXT:    s_mov_b64 s[0:1], 0
; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
; GFX940-NEXT:    buffer_wbl2 sc1
; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc1
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_mov_b32_e32 v5, v3
; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT:    v_mov_b32_e32 v4, v3
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    flat_load_dword v4, v[0:1]
; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v4, v[0:1]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v4, v3
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v4, v[0:1]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v4, v3
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
  ret void
}

; Agent-scope seq_cst atomicrmw fsub on a flat pointer displaced by -512 floats
; (-2048 bytes); the old value is unused. Only the GFX12 checks keep the
; displacement as an instruction offset of -2048. The checks for every other
; target form the address with explicit 64-bit arithmetic before the retry
; loop (carry-chained adds of 0xfffff800 and -1, plus a v_lshl_add_u64 in the
; GFX940 checks).
define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:-2048
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT:    v_mov_b32_e32 v4, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX940-NEXT:    s_movk_i32 s0, 0xf800
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX940-NEXT:    flat_load_dword v5, v[4:5]
; GFX940-NEXT:    s_mov_b32 s1, -1
; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT:    s_mov_b64 s[0:1], 0
; GFX940-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
; GFX940-NEXT:    buffer_wbl2 sc1
; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc1
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_mov_b32_e32 v5, v3
; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT:    flat_load_b32 v4, v[3:4]
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT:    v_mov_b32_e32 v4, v3
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    flat_load_dword v4, v[0:1]
; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_sub_f32_e32 v0, v1, v2
; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT:    flat_load_dword v1, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_sub_f32_e32 v0, v1, v2
; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v1, v0
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT:    flat_load_dword v4, v[0:1]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v4, v3
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT:    flat_load_dword v4, v[0:1]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v4, v3
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 -512
  %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
  ret void
}

; seq_cst atomicrmw fsub with no syncscope operand on a flat pointer displaced
; by 511 floats (2044 bytes); the old value is returned to the caller.
define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
offset:2044 1228; GFX12-NEXT: s_mov_b32 s0, 0 1229; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start 1230; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1231; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1232; GFX12-NEXT: v_mov_b32_e32 v4, v3 1233; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1234; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 1235; GFX12-NEXT: global_wb scope:SCOPE_SYS 1236; GFX12-NEXT: s_wait_storecnt 0x0 1237; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 1238; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1239; GFX12-NEXT: global_inv scope:SCOPE_SYS 1240; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1241; GFX12-NEXT: s_wait_alu 0xfffe 1242; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 1243; GFX12-NEXT: s_wait_alu 0xfffe 1244; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1245; GFX12-NEXT: s_cbranch_execnz .LBB6_1 1246; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1247; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1248; GFX12-NEXT: v_mov_b32_e32 v0, v3 1249; GFX12-NEXT: s_wait_alu 0xfffe 1250; GFX12-NEXT: s_setpc_b64 s[30:31] 1251; 1252; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1253; GFX940: ; %bb.0: 1254; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1255; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 1256; GFX940-NEXT: s_mov_b64 s[0:1], 0 1257; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start 1258; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1259; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1260; GFX940-NEXT: v_mov_b32_e32 v5, v3 1261; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 1262; GFX940-NEXT: buffer_wbl2 sc0 sc1 1263; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 1264; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1265; GFX940-NEXT: buffer_inv sc0 sc1 1266; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1267; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1268; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1269; GFX940-NEXT: s_cbranch_execnz .LBB6_1 1270; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1271; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1272; GFX940-NEXT: v_mov_b32_e32 v0, v3 1273; GFX940-NEXT: s_setpc_b64 s[30:31] 1274; 1275; GFX11-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1276; GFX11: ; %bb.0: 1277; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1278; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 1279; GFX11-NEXT: s_mov_b32 s0, 0 1280; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start 1281; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1282; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1283; GFX11-NEXT: v_mov_b32_e32 v4, v3 1284; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1285; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 1286; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1287; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 1288; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1289; GFX11-NEXT: buffer_gl1_inv 1290; GFX11-NEXT: buffer_gl0_inv 1291; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1292; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1293; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1294; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1295; GFX11-NEXT: s_cbranch_execnz .LBB6_1 1296; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1297; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1298; GFX11-NEXT: v_mov_b32_e32 v0, v3 1299; GFX11-NEXT: s_setpc_b64 s[30:31] 1300; 1301; GFX10-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1302; GFX10: ; %bb.0: 1303; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1304; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 1305; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 1306; GFX10-NEXT: s_mov_b32 s4, 0 1307; GFX10-NEXT: flat_load_dword v0, v[3:4] 1308; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start 1309; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1310; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1311; GFX10-NEXT: v_mov_b32_e32 v1, v0 1312; GFX10-NEXT: v_sub_f32_e32 v0, v1, v2 1313; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1314; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1315; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 1316; GFX10-NEXT: buffer_gl1_inv 1317; GFX10-NEXT: buffer_gl0_inv 1318; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 1319; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1320; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1321; GFX10-NEXT: s_cbranch_execnz .LBB6_1 1322; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1323; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1324; GFX10-NEXT: s_setpc_b64 s[30:31] 1325; 1326; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1327; GFX90A: ; %bb.0: 1328; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1329; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 1330; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1331; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start 1332; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1333; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1334; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1335; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 1336; GFX90A-NEXT: buffer_wbl2 1337; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 1338; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1339; GFX90A-NEXT: buffer_invl2 1340; GFX90A-NEXT: buffer_wbinvl1 1341; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1342; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1343; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1344; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 1345; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1346; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1347; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1348; GFX90A-NEXT: s_setpc_b64 s[30:31] 1349; 1350; GFX908-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1351; GFX908: ; %bb.0: 1352; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1353; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 1354; GFX908-NEXT: s_mov_b64 s[4:5], 0 1355; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start 1356; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1357; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1358; GFX908-NEXT: v_mov_b32_e32 v4, v3 1359; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 1360; GFX908-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1361; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1362; GFX908-NEXT: buffer_wbinvl1 1363; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1364; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1365; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1366; GFX908-NEXT: s_cbranch_execnz .LBB6_1 1367; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1368; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1369; GFX908-NEXT: v_mov_b32_e32 v0, v3 1370; GFX908-NEXT: s_setpc_b64 s[30:31] 1371; 1372; GFX8-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1373; GFX8: ; %bb.0: 1374; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1375; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 1376; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1377; GFX8-NEXT: flat_load_dword v0, v[3:4] 1378; GFX8-NEXT: s_mov_b64 s[4:5], 0 1379; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start 1380; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1381; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1382; GFX8-NEXT: v_mov_b32_e32 v1, v0 1383; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 1384; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1385; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1386; GFX8-NEXT: buffer_wbinvl1 1387; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 1388; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1389; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1390; GFX8-NEXT: s_cbranch_execnz .LBB6_1 1391; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1392; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1393; GFX8-NEXT: s_setpc_b64 s[30:31] 1394; 1395; GFX7-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: 1396; GFX7: ; %bb.0: 1397; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1398; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 1399; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1400; GFX7-NEXT: flat_load_dword v0, v[3:4] 1401; GFX7-NEXT: s_mov_b64 s[4:5], 0 1402; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start 1403; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1404; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1405; GFX7-NEXT: 
v_mov_b32_e32 v1, v0
; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %result = atomicrmw fsub ptr %gep, float %val seq_cst
  ret float %result
}

; seq_cst atomicrmw fsub with no syncscope operand on a flat pointer displaced
; by 511 floats (2044 bytes); the old value is unused. The checks for every
; target show an initial load followed by a v_sub_f32 + flat_atomic_cmpswap
; retry loop. The GFX12 checks use SCOPE_SYS and issue a global_wb before the
; cmpswap; the GFX940 checks put sc0 sc1 on the writeback, the cmpswap and the
; invalidate; the GFX90A checks bracket the cmpswap with buffer_wbl2 and
; buffer_invl2. The GFX10, GFX8 and GFX7 checks add 0x7fc to the pointer with
; an explicit 64-bit add instead of using an instruction offset.
define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX12-NEXT:    global_wb scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_SYS
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT:    v_mov_b32_e32 v4, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
; GFX940-NEXT:    s_mov_b64 s[0:1], 0
; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
; GFX940-NEXT:    buffer_wbl2 sc0 sc1
; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc0 sc1
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_mov_b32_e32 v5, v3
; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT:    v_mov_b32_e32 v4, v3
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    flat_load_dword v4, v[0:1]
; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v4, v[0:1]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v4, v3
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v4, v[0:1]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v4, v3
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %unused = atomicrmw fsub ptr %gep, float %val seq_cst
  ret void
}

; --------------------------------------------------------------------
; float with ftz/daz
; --------------------------------------------------------------------

define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    flat_load_b32 v3,
v[0:1] 1631; GFX12-NEXT: s_mov_b32 s0, 0 1632; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start 1633; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1634; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1635; GFX12-NEXT: v_mov_b32_e32 v4, v3 1636; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1637; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 1638; GFX12-NEXT: s_wait_storecnt 0x0 1639; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1640; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1641; GFX12-NEXT: global_inv scope:SCOPE_DEV 1642; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1643; GFX12-NEXT: s_wait_alu 0xfffe 1644; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 1645; GFX12-NEXT: s_wait_alu 0xfffe 1646; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1647; GFX12-NEXT: s_cbranch_execnz .LBB8_1 1648; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1649; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1650; GFX12-NEXT: v_mov_b32_e32 v0, v3 1651; GFX12-NEXT: s_wait_alu 0xfffe 1652; GFX12-NEXT: s_setpc_b64 s[30:31] 1653; 1654; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1655; GFX940: ; %bb.0: 1656; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1657; GFX940-NEXT: flat_load_dword v3, v[0:1] 1658; GFX940-NEXT: s_mov_b64 s[0:1], 0 1659; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start 1660; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1661; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1662; GFX940-NEXT: v_mov_b32_e32 v5, v3 1663; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 1664; GFX940-NEXT: buffer_wbl2 sc1 1665; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 1666; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1667; GFX940-NEXT: buffer_inv sc1 1668; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1669; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1670; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1671; GFX940-NEXT: s_cbranch_execnz .LBB8_1 1672; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1673; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1674; GFX940-NEXT: v_mov_b32_e32 v0, v3 1675; GFX940-NEXT: 
s_setpc_b64 s[30:31] 1676; 1677; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1678; GFX11: ; %bb.0: 1679; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1680; GFX11-NEXT: flat_load_b32 v3, v[0:1] 1681; GFX11-NEXT: s_mov_b32 s0, 0 1682; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start 1683; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1684; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1685; GFX11-NEXT: v_mov_b32_e32 v4, v3 1686; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1687; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 1688; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1689; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 1690; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1691; GFX11-NEXT: buffer_gl1_inv 1692; GFX11-NEXT: buffer_gl0_inv 1693; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1694; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1695; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1696; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1697; GFX11-NEXT: s_cbranch_execnz .LBB8_1 1698; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1699; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1700; GFX11-NEXT: v_mov_b32_e32 v0, v3 1701; GFX11-NEXT: s_setpc_b64 s[30:31] 1702; 1703; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1704; GFX10: ; %bb.0: 1705; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1706; GFX10-NEXT: flat_load_dword v3, v[0:1] 1707; GFX10-NEXT: s_mov_b32 s4, 0 1708; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start 1709; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1710; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1711; GFX10-NEXT: v_mov_b32_e32 v4, v3 1712; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 1713; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1714; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1715; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1716; GFX10-NEXT: buffer_gl1_inv 1717; GFX10-NEXT: buffer_gl0_inv 1718; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1719; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1720; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1721; GFX10-NEXT: 
s_cbranch_execnz .LBB8_1 1722; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1723; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1724; GFX10-NEXT: v_mov_b32_e32 v0, v3 1725; GFX10-NEXT: s_setpc_b64 s[30:31] 1726; 1727; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1728; GFX90A: ; %bb.0: 1729; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1730; GFX90A-NEXT: flat_load_dword v3, v[0:1] 1731; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1732; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start 1733; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1734; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1735; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1736; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 1737; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 1738; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1739; GFX90A-NEXT: buffer_wbinvl1 1740; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1741; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1742; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1743; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 1744; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1745; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1746; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1747; GFX90A-NEXT: s_setpc_b64 s[30:31] 1748; 1749; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1750; GFX908: ; %bb.0: 1751; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1752; GFX908-NEXT: flat_load_dword v3, v[0:1] 1753; GFX908-NEXT: s_mov_b64 s[4:5], 0 1754; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start 1755; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1756; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1757; GFX908-NEXT: v_mov_b32_e32 v4, v3 1758; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 1759; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1760; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1761; GFX908-NEXT: buffer_wbinvl1 1762; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1763; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1764; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1765; GFX908-NEXT: s_cbranch_execnz .LBB8_1 1766; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end 1767; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1768; GFX908-NEXT: v_mov_b32_e32 v0, v3 1769; GFX908-NEXT: s_setpc_b64 s[30:31] 1770; 1771; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1772; GFX8: ; %bb.0: 1773; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1774; GFX8-NEXT: flat_load_dword v3, v[0:1] 1775; GFX8-NEXT: s_mov_b64 s[4:5], 0 1776; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start 1777; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1778; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1779; GFX8-NEXT: v_mov_b32_e32 v4, v3 1780; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 1781; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1782; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1783; GFX8-NEXT: buffer_wbinvl1 1784; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1785; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1786; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1787; GFX8-NEXT: s_cbranch_execnz .LBB8_1 1788; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1789; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1790; GFX8-NEXT: v_mov_b32_e32 v0, v3 1791; GFX8-NEXT: s_setpc_b64 s[30:31] 1792; 1793; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: 1794; GFX7: ; %bb.0: 1795; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1796; GFX7-NEXT: flat_load_dword v3, v[0:1] 1797; GFX7-NEXT: s_mov_b64 s[4:5], 0 1798; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start 1799; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1800; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1801; GFX7-NEXT: v_mov_b32_e32 v4, v3 1802; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 1803; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1804; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1805; GFX7-NEXT: buffer_wbinvl1 1806; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1807; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1808; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1809; GFX7-NEXT: s_cbranch_execnz .LBB8_1 1810; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1811; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1812; GFX7-NEXT: v_mov_b32_e32 v0, v3 1813; GFX7-NEXT: s_setpc_b64 
s[30:31] 1814 %result = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst 1815 ret float %result 1816} 1817 1818define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 { 1819; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1820; GFX12: ; %bb.0: 1821; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1822; GFX12-NEXT: s_wait_expcnt 0x0 1823; GFX12-NEXT: s_wait_samplecnt 0x0 1824; GFX12-NEXT: s_wait_bvhcnt 0x0 1825; GFX12-NEXT: s_wait_kmcnt 0x0 1826; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 1827; GFX12-NEXT: s_mov_b32 s0, 0 1828; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start 1829; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1830; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1831; GFX12-NEXT: v_mov_b32_e32 v4, v3 1832; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1833; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 1834; GFX12-NEXT: s_wait_storecnt 0x0 1835; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1836; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1837; GFX12-NEXT: global_inv scope:SCOPE_DEV 1838; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1839; GFX12-NEXT: s_wait_alu 0xfffe 1840; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 1841; GFX12-NEXT: s_wait_alu 0xfffe 1842; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1843; GFX12-NEXT: s_cbranch_execnz .LBB9_1 1844; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1845; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1846; GFX12-NEXT: v_mov_b32_e32 v0, v3 1847; GFX12-NEXT: s_wait_alu 0xfffe 1848; GFX12-NEXT: s_setpc_b64 s[30:31] 1849; 1850; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1851; GFX940: ; %bb.0: 1852; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1853; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 1854; GFX940-NEXT: s_mov_b64 s[0:1], 0 1855; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start 1856; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1857; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1858; GFX940-NEXT: 
v_mov_b32_e32 v5, v3 1859; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 1860; GFX940-NEXT: buffer_wbl2 sc1 1861; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 1862; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1863; GFX940-NEXT: buffer_inv sc1 1864; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1865; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1866; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1867; GFX940-NEXT: s_cbranch_execnz .LBB9_1 1868; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1869; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1870; GFX940-NEXT: v_mov_b32_e32 v0, v3 1871; GFX940-NEXT: s_setpc_b64 s[30:31] 1872; 1873; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1874; GFX11: ; %bb.0: 1875; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1876; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 1877; GFX11-NEXT: s_mov_b32 s0, 0 1878; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start 1879; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1880; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1881; GFX11-NEXT: v_mov_b32_e32 v4, v3 1882; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1883; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 1884; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1885; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 1886; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1887; GFX11-NEXT: buffer_gl1_inv 1888; GFX11-NEXT: buffer_gl0_inv 1889; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1890; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1891; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1892; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1893; GFX11-NEXT: s_cbranch_execnz .LBB9_1 1894; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1895; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1896; GFX11-NEXT: v_mov_b32_e32 v0, v3 1897; GFX11-NEXT: s_setpc_b64 s[30:31] 1898; 1899; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1900; GFX10: ; %bb.0: 1901; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1902; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, 
v0 1903; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 1904; GFX10-NEXT: s_mov_b32 s4, 0 1905; GFX10-NEXT: flat_load_dword v0, v[3:4] 1906; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start 1907; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1908; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1909; GFX10-NEXT: v_mov_b32_e32 v1, v0 1910; GFX10-NEXT: v_sub_f32_e32 v0, v1, v2 1911; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1912; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1913; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1914; GFX10-NEXT: buffer_gl1_inv 1915; GFX10-NEXT: buffer_gl0_inv 1916; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 1917; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1918; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1919; GFX10-NEXT: s_cbranch_execnz .LBB9_1 1920; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1921; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1922; GFX10-NEXT: s_setpc_b64 s[30:31] 1923; 1924; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1925; GFX90A: ; %bb.0: 1926; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1927; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 1928; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1929; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start 1930; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1931; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1932; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1933; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 1934; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 1935; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1936; GFX90A-NEXT: buffer_wbinvl1 1937; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1938; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1939; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1940; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 1941; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1942; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1943; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1944; GFX90A-NEXT: s_setpc_b64 s[30:31] 1945; 1946; GFX908-LABEL: 
flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1947; GFX908: ; %bb.0: 1948; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1949; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 1950; GFX908-NEXT: s_mov_b64 s[4:5], 0 1951; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start 1952; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1953; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1954; GFX908-NEXT: v_mov_b32_e32 v4, v3 1955; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 1956; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1957; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1958; GFX908-NEXT: buffer_wbinvl1 1959; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1960; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1961; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1962; GFX908-NEXT: s_cbranch_execnz .LBB9_1 1963; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1964; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1965; GFX908-NEXT: v_mov_b32_e32 v0, v3 1966; GFX908-NEXT: s_setpc_b64 s[30:31] 1967; 1968; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1969; GFX8: ; %bb.0: 1970; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1971; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 1972; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1973; GFX8-NEXT: flat_load_dword v0, v[3:4] 1974; GFX8-NEXT: s_mov_b64 s[4:5], 0 1975; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start 1976; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1977; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1978; GFX8-NEXT: v_mov_b32_e32 v1, v0 1979; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 1980; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 1981; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1982; GFX8-NEXT: buffer_wbinvl1 1983; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 1984; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1985; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1986; GFX8-NEXT: s_cbranch_execnz .LBB9_1 1987; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1988; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1989; GFX8-NEXT: s_setpc_b64 s[30:31] 1990; 
1991; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: 1992; GFX7: ; %bb.0: 1993; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1994; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 1995; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1996; GFX7-NEXT: flat_load_dword v0, v[3:4] 1997; GFX7-NEXT: s_mov_b64 s[4:5], 0 1998; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start 1999; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2000; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2001; GFX7-NEXT: v_mov_b32_e32 v1, v0 2002; GFX7-NEXT: v_sub_f32_e32 v0, v1, v2 2003; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2004; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2005; GFX7-NEXT: buffer_wbinvl1 2006; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2007; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2008; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2009; GFX7-NEXT: s_cbranch_execnz .LBB9_1 2010; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2011; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2012; GFX7-NEXT: s_setpc_b64 s[30:31] 2013 %gep = getelementptr float, ptr %ptr, i64 511 2014 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst 2015 ret float %result 2016} 2017 2018define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float %val) #1 { 2019; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2020; GFX12: ; %bb.0: 2021; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2022; GFX12-NEXT: s_wait_expcnt 0x0 2023; GFX12-NEXT: s_wait_samplecnt 0x0 2024; GFX12-NEXT: s_wait_bvhcnt 0x0 2025; GFX12-NEXT: s_wait_kmcnt 0x0 2026; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 2027; GFX12-NEXT: s_mov_b32 s0, 0 2028; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start 2029; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2030; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2031; GFX12-NEXT: v_mov_b32_e32 v4, v3 2032; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2033; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 2034; GFX12-NEXT: s_wait_storecnt 0x0 2035; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, 
v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2036; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2037; GFX12-NEXT: global_inv scope:SCOPE_DEV 2038; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2039; GFX12-NEXT: s_wait_alu 0xfffe 2040; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 2041; GFX12-NEXT: s_wait_alu 0xfffe 2042; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2043; GFX12-NEXT: s_cbranch_execnz .LBB10_1 2044; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2045; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 2046; GFX12-NEXT: v_mov_b32_e32 v0, v3 2047; GFX12-NEXT: s_wait_alu 0xfffe 2048; GFX12-NEXT: s_setpc_b64 s[30:31] 2049; 2050; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2051; GFX940: ; %bb.0: 2052; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2053; GFX940-NEXT: v_mov_b32_e32 v4, v0 2054; GFX940-NEXT: v_mov_b32_e32 v5, v1 2055; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 2056; GFX940-NEXT: s_movk_i32 s0, 0xf800 2057; GFX940-NEXT: s_nop 0 2058; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 2059; GFX940-NEXT: flat_load_dword v0, v[0:1] 2060; GFX940-NEXT: s_mov_b32 s1, -1 2061; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 2062; GFX940-NEXT: s_mov_b64 s[0:1], 0 2063; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start 2064; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2065; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2066; GFX940-NEXT: v_mov_b32_e32 v1, v0 2067; GFX940-NEXT: v_sub_f32_e32 v0, v1, v2 2068; GFX940-NEXT: buffer_wbl2 sc1 2069; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 2070; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2071; GFX940-NEXT: buffer_inv sc1 2072; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2073; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2074; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2075; GFX940-NEXT: s_cbranch_execnz .LBB10_1 2076; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2077; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2078; GFX940-NEXT: s_setpc_b64 s[30:31] 2079; 2080; GFX11-LABEL: 
flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2081; GFX11: ; %bb.0: 2082; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2083; GFX11-NEXT: v_mov_b32_e32 v3, v0 2084; GFX11-NEXT: s_mov_b32 s0, 0 2085; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2086; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 2087; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 2088; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 2089; GFX11-NEXT: flat_load_b32 v0, v[4:5] 2090; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 2091; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start 2092; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2093; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2094; GFX11-NEXT: v_mov_b32_e32 v1, v0 2095; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2096; GFX11-NEXT: v_sub_f32_e32 v0, v1, v2 2097; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2098; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc 2099; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2100; GFX11-NEXT: buffer_gl1_inv 2101; GFX11-NEXT: buffer_gl0_inv 2102; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 2103; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 2104; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2105; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2106; GFX11-NEXT: s_cbranch_execnz .LBB10_1 2107; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2108; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2109; GFX11-NEXT: s_setpc_b64 s[30:31] 2110; 2111; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2112; GFX10: ; %bb.0: 2113; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2114; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 2115; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 2116; GFX10-NEXT: s_mov_b32 s4, 0 2117; GFX10-NEXT: flat_load_dword v0, v[3:4] 2118; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start 2119; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2120; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2121; GFX10-NEXT: v_mov_b32_e32 v1, v0 2122; GFX10-NEXT: v_sub_f32_e32 
v0, v1, v2 2123; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2124; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2125; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2126; GFX10-NEXT: buffer_gl1_inv 2127; GFX10-NEXT: buffer_gl0_inv 2128; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 2129; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2130; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2131; GFX10-NEXT: s_cbranch_execnz .LBB10_1 2132; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2133; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2134; GFX10-NEXT: s_setpc_b64 s[30:31] 2135; 2136; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2137; GFX90A: ; %bb.0: 2138; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2139; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 2140; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 2141; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2142; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2143; GFX90A-NEXT: flat_load_dword v0, v[0:1] 2144; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2145; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start 2146; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2147; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2148; GFX90A-NEXT: v_mov_b32_e32 v1, v0 2149; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 2150; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 2151; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2152; GFX90A-NEXT: buffer_wbinvl1 2153; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2154; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2155; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2156; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 2157; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2158; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2159; GFX90A-NEXT: s_setpc_b64 s[30:31] 2160; 2161; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2162; GFX908: ; %bb.0: 2163; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2164; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 2165; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, 
-1, v1, vcc 2166; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2167; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2168; GFX908-NEXT: flat_load_dword v0, v[0:1] 2169; GFX908-NEXT: s_mov_b64 s[4:5], 0 2170; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start 2171; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2172; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2173; GFX908-NEXT: v_mov_b32_e32 v1, v0 2174; GFX908-NEXT: v_sub_f32_e32 v0, v1, v2 2175; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2176; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2177; GFX908-NEXT: buffer_wbinvl1 2178; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2179; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2180; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2181; GFX908-NEXT: s_cbranch_execnz .LBB10_1 2182; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2183; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2184; GFX908-NEXT: s_setpc_b64 s[30:31] 2185; 2186; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2187; GFX8: ; %bb.0: 2188; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2189; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 2190; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 2191; GFX8-NEXT: flat_load_dword v0, v[3:4] 2192; GFX8-NEXT: s_mov_b64 s[4:5], 0 2193; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start 2194; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2195; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2196; GFX8-NEXT: v_mov_b32_e32 v1, v0 2197; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 2198; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2199; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2200; GFX8-NEXT: buffer_wbinvl1 2201; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2202; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2203; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2204; GFX8-NEXT: s_cbranch_execnz .LBB10_1 2205; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2206; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2207; GFX8-NEXT: s_setpc_b64 s[30:31] 2208; 2209; GFX7-LABEL: 
flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: 2210; GFX7: ; %bb.0: 2211; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2212; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 2213; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 2214; GFX7-NEXT: flat_load_dword v0, v[3:4] 2215; GFX7-NEXT: s_mov_b64 s[4:5], 0 2216; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start 2217; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2218; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2219; GFX7-NEXT: v_mov_b32_e32 v1, v0 2220; GFX7-NEXT: v_sub_f32_e32 v0, v1, v2 2221; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2222; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2223; GFX7-NEXT: buffer_wbinvl1 2224; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2225; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2226; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2227; GFX7-NEXT: s_cbranch_execnz .LBB10_1 2228; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2229; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2230; GFX7-NEXT: s_setpc_b64 s[30:31] 2231 %gep = getelementptr float, ptr %ptr, i64 -512 2232 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst 2233 ret float %result 2234} 2235 2236define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { 2237; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2238; GFX12: ; %bb.0: 2239; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2240; GFX12-NEXT: s_wait_expcnt 0x0 2241; GFX12-NEXT: s_wait_samplecnt 0x0 2242; GFX12-NEXT: s_wait_bvhcnt 0x0 2243; GFX12-NEXT: s_wait_kmcnt 0x0 2244; GFX12-NEXT: flat_load_b32 v4, v[0:1] 2245; GFX12-NEXT: s_mov_b32 s0, 0 2246; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start 2247; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2248; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2249; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 2250; GFX12-NEXT: s_wait_storecnt 0x0 2251; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2252; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2253; GFX12-NEXT: global_inv scope:SCOPE_DEV 
2254; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2255; GFX12-NEXT: v_mov_b32_e32 v4, v3 2256; GFX12-NEXT: s_wait_alu 0xfffe 2257; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 2258; GFX12-NEXT: s_wait_alu 0xfffe 2259; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2260; GFX12-NEXT: s_cbranch_execnz .LBB11_1 2261; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2262; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 2263; GFX12-NEXT: s_wait_alu 0xfffe 2264; GFX12-NEXT: s_setpc_b64 s[30:31] 2265; 2266; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2267; GFX940: ; %bb.0: 2268; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2269; GFX940-NEXT: flat_load_dword v5, v[0:1] 2270; GFX940-NEXT: s_mov_b64 s[0:1], 0 2271; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start 2272; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2273; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2274; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 2275; GFX940-NEXT: buffer_wbl2 sc1 2276; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 2277; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2278; GFX940-NEXT: buffer_inv sc1 2279; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2280; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2281; GFX940-NEXT: v_mov_b32_e32 v5, v3 2282; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2283; GFX940-NEXT: s_cbranch_execnz .LBB11_1 2284; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2285; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2286; GFX940-NEXT: s_setpc_b64 s[30:31] 2287; 2288; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2289; GFX11: ; %bb.0: 2290; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2291; GFX11-NEXT: flat_load_b32 v4, v[0:1] 2292; GFX11-NEXT: s_mov_b32 s0, 0 2293; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start 2294; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2295; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2296; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 2297; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2298; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 2299; GFX11-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 2300; GFX11-NEXT: buffer_gl1_inv 2301; GFX11-NEXT: buffer_gl0_inv 2302; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2303; GFX11-NEXT: v_mov_b32_e32 v4, v3 2304; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 2305; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2306; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2307; GFX11-NEXT: s_cbranch_execnz .LBB11_1 2308; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2309; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2310; GFX11-NEXT: s_setpc_b64 s[30:31] 2311; 2312; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2313; GFX10: ; %bb.0: 2314; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2315; GFX10-NEXT: flat_load_dword v4, v[0:1] 2316; GFX10-NEXT: s_mov_b32 s4, 0 2317; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start 2318; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2319; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2320; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 2321; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2322; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2323; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2324; GFX10-NEXT: buffer_gl1_inv 2325; GFX10-NEXT: buffer_gl0_inv 2326; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2327; GFX10-NEXT: v_mov_b32_e32 v4, v3 2328; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2329; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2330; GFX10-NEXT: s_cbranch_execnz .LBB11_1 2331; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2332; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2333; GFX10-NEXT: s_setpc_b64 s[30:31] 2334; 2335; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2336; GFX90A: ; %bb.0: 2337; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2338; GFX90A-NEXT: flat_load_dword v5, v[0:1] 2339; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2340; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start 2341; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2342; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2343; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 2344; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 2345; GFX90A-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2346; GFX90A-NEXT: buffer_wbinvl1 2347; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2348; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2349; GFX90A-NEXT: v_mov_b32_e32 v5, v3 2350; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2351; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 2352; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2353; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2354; GFX90A-NEXT: s_setpc_b64 s[30:31] 2355; 2356; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2357; GFX908: ; %bb.0: 2358; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2359; GFX908-NEXT: flat_load_dword v4, v[0:1] 2360; GFX908-NEXT: s_mov_b64 s[4:5], 0 2361; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start 2362; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2363; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2364; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 2365; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2366; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2367; GFX908-NEXT: buffer_wbinvl1 2368; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2369; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2370; GFX908-NEXT: v_mov_b32_e32 v4, v3 2371; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2372; GFX908-NEXT: s_cbranch_execnz .LBB11_1 2373; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2374; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2375; GFX908-NEXT: s_setpc_b64 s[30:31] 2376; 2377; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2378; GFX8: ; %bb.0: 2379; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2380; GFX8-NEXT: flat_load_dword v4, v[0:1] 2381; GFX8-NEXT: s_mov_b64 s[4:5], 0 2382; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start 2383; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2384; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2385; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 2386; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2387; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2388; GFX8-NEXT: buffer_wbinvl1 2389; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2390; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2391; 
GFX8-NEXT: v_mov_b32_e32 v4, v3 2392; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2393; GFX8-NEXT: s_cbranch_execnz .LBB11_1 2394; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2395; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2396; GFX8-NEXT: s_setpc_b64 s[30:31] 2397; 2398; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: 2399; GFX7: ; %bb.0: 2400; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2401; GFX7-NEXT: flat_load_dword v4, v[0:1] 2402; GFX7-NEXT: s_mov_b64 s[4:5], 0 2403; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start 2404; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2405; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2406; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 2407; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2408; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2409; GFX7-NEXT: buffer_wbinvl1 2410; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2411; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2412; GFX7-NEXT: v_mov_b32_e32 v4, v3 2413; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2414; GFX7-NEXT: s_cbranch_execnz .LBB11_1 2415; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2416; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2417; GFX7-NEXT: s_setpc_b64 s[30:31] 2418 %unused = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst 2419 ret void 2420} 2421 2422define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 { 2423; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2424; GFX12: ; %bb.0: 2425; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2426; GFX12-NEXT: s_wait_expcnt 0x0 2427; GFX12-NEXT: s_wait_samplecnt 0x0 2428; GFX12-NEXT: s_wait_bvhcnt 0x0 2429; GFX12-NEXT: s_wait_kmcnt 0x0 2430; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 2431; GFX12-NEXT: s_mov_b32 s0, 0 2432; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start 2433; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2434; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2435; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 2436; GFX12-NEXT: s_wait_storecnt 0x0 2437; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], 
v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2438; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2439; GFX12-NEXT: global_inv scope:SCOPE_DEV 2440; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2441; GFX12-NEXT: v_mov_b32_e32 v4, v3 2442; GFX12-NEXT: s_wait_alu 0xfffe 2443; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 2444; GFX12-NEXT: s_wait_alu 0xfffe 2445; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2446; GFX12-NEXT: s_cbranch_execnz .LBB12_1 2447; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2448; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 2449; GFX12-NEXT: s_wait_alu 0xfffe 2450; GFX12-NEXT: s_setpc_b64 s[30:31] 2451; 2452; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2453; GFX940: ; %bb.0: 2454; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2455; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 2456; GFX940-NEXT: s_mov_b64 s[0:1], 0 2457; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start 2458; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2459; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2460; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 2461; GFX940-NEXT: buffer_wbl2 sc1 2462; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 2463; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2464; GFX940-NEXT: buffer_inv sc1 2465; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2466; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2467; GFX940-NEXT: v_mov_b32_e32 v5, v3 2468; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2469; GFX940-NEXT: s_cbranch_execnz .LBB12_1 2470; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2471; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2472; GFX940-NEXT: s_setpc_b64 s[30:31] 2473; 2474; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2475; GFX11: ; %bb.0: 2476; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2477; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 2478; GFX11-NEXT: s_mov_b32 s0, 0 2479; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start 2480; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2481; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2482; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 2483; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2484; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 2485; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2486; GFX11-NEXT: buffer_gl1_inv 2487; GFX11-NEXT: buffer_gl0_inv 2488; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2489; GFX11-NEXT: v_mov_b32_e32 v4, v3 2490; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 2491; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2492; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2493; GFX11-NEXT: s_cbranch_execnz .LBB12_1 2494; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2495; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2496; GFX11-NEXT: s_setpc_b64 s[30:31] 2497; 2498; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2499; GFX10: ; %bb.0: 2500; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2501; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 2502; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2503; GFX10-NEXT: s_mov_b32 s4, 0 2504; GFX10-NEXT: flat_load_dword v4, v[0:1] 2505; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start 2506; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2507; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2508; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 2509; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2510; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2511; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2512; GFX10-NEXT: buffer_gl1_inv 2513; GFX10-NEXT: buffer_gl0_inv 2514; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2515; GFX10-NEXT: v_mov_b32_e32 v4, v3 2516; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2517; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2518; GFX10-NEXT: s_cbranch_execnz .LBB12_1 2519; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2520; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2521; GFX10-NEXT: s_setpc_b64 s[30:31] 2522; 2523; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2524; GFX90A: ; %bb.0: 2525; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
2526; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 2527; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2528; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start 2529; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2530; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2531; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 2532; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 2533; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2534; GFX90A-NEXT: buffer_wbinvl1 2535; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2536; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2537; GFX90A-NEXT: v_mov_b32_e32 v5, v3 2538; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2539; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 2540; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2541; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2542; GFX90A-NEXT: s_setpc_b64 s[30:31] 2543; 2544; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2545; GFX908: ; %bb.0: 2546; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2547; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 2548; GFX908-NEXT: s_mov_b64 s[4:5], 0 2549; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start 2550; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2551; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2552; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 2553; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 2554; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2555; GFX908-NEXT: buffer_wbinvl1 2556; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2557; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2558; GFX908-NEXT: v_mov_b32_e32 v4, v3 2559; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2560; GFX908-NEXT: s_cbranch_execnz .LBB12_1 2561; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2562; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2563; GFX908-NEXT: s_setpc_b64 s[30:31] 2564; 2565; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2566; GFX8: ; %bb.0: 2567; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2568; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 2569; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2570; GFX8-NEXT: flat_load_dword v4, v[0:1] 2571; GFX8-NEXT: s_mov_b64 s[4:5], 0 2572; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start 2573; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2574; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2575; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 2576; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2577; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2578; GFX8-NEXT: buffer_wbinvl1 2579; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2580; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2581; GFX8-NEXT: v_mov_b32_e32 v4, v3 2582; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2583; GFX8-NEXT: s_cbranch_execnz .LBB12_1 2584; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2585; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2586; GFX8-NEXT: s_setpc_b64 s[30:31] 2587; 2588; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: 2589; GFX7: ; %bb.0: 2590; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2591; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 2592; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2593; GFX7-NEXT: flat_load_dword v4, v[0:1] 2594; GFX7-NEXT: s_mov_b64 s[4:5], 0 2595; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start 2596; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2597; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2598; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 2599; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2600; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2601; GFX7-NEXT: buffer_wbinvl1 2602; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2603; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2604; GFX7-NEXT: v_mov_b32_e32 v4, v3 2605; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2606; GFX7-NEXT: s_cbranch_execnz .LBB12_1 2607; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2608; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2609; GFX7-NEXT: s_setpc_b64 s[30:31] 2610 %gep = getelementptr float, ptr %ptr, i64 511 2611 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst 2612 ret void 2613} 2614 2615define void 
@flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, float %val) #1 { 2616; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2617; GFX12: ; %bb.0: 2618; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2619; GFX12-NEXT: s_wait_expcnt 0x0 2620; GFX12-NEXT: s_wait_samplecnt 0x0 2621; GFX12-NEXT: s_wait_bvhcnt 0x0 2622; GFX12-NEXT: s_wait_kmcnt 0x0 2623; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 2624; GFX12-NEXT: s_mov_b32 s0, 0 2625; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start 2626; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2627; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2628; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 2629; GFX12-NEXT: s_wait_storecnt 0x0 2630; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2631; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2632; GFX12-NEXT: global_inv scope:SCOPE_DEV 2633; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2634; GFX12-NEXT: v_mov_b32_e32 v4, v3 2635; GFX12-NEXT: s_wait_alu 0xfffe 2636; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 2637; GFX12-NEXT: s_wait_alu 0xfffe 2638; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2639; GFX12-NEXT: s_cbranch_execnz .LBB13_1 2640; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2641; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 2642; GFX12-NEXT: s_wait_alu 0xfffe 2643; GFX12-NEXT: s_setpc_b64 s[30:31] 2644; 2645; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2646; GFX940: ; %bb.0: 2647; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2648; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 2649; GFX940-NEXT: s_movk_i32 s0, 0xf800 2650; GFX940-NEXT: s_nop 0 2651; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 2652; GFX940-NEXT: flat_load_dword v5, v[4:5] 2653; GFX940-NEXT: s_mov_b32 s1, -1 2654; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 2655; GFX940-NEXT: s_mov_b64 s[0:1], 0 2656; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start 2657; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2658; GFX940-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 2659; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 2660; GFX940-NEXT: buffer_wbl2 sc1 2661; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 2662; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2663; GFX940-NEXT: buffer_inv sc1 2664; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2665; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2666; GFX940-NEXT: v_mov_b32_e32 v5, v3 2667; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2668; GFX940-NEXT: s_cbranch_execnz .LBB13_1 2669; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2670; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2671; GFX940-NEXT: s_setpc_b64 s[30:31] 2672; 2673; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2674; GFX11: ; %bb.0: 2675; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2676; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 2677; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 2678; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 2679; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 2680; GFX11-NEXT: flat_load_b32 v4, v[3:4] 2681; GFX11-NEXT: s_mov_b32 s0, 0 2682; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start 2683; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2684; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2685; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 2686; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2687; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 2688; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2689; GFX11-NEXT: buffer_gl1_inv 2690; GFX11-NEXT: buffer_gl0_inv 2691; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2692; GFX11-NEXT: v_mov_b32_e32 v4, v3 2693; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 2694; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2695; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2696; GFX11-NEXT: s_cbranch_execnz .LBB13_1 2697; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2698; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2699; GFX11-NEXT: s_setpc_b64 s[30:31] 2700; 2701; GFX10-LABEL: 
flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2702; GFX10: ; %bb.0: 2703; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2704; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 2705; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 2706; GFX10-NEXT: s_mov_b32 s4, 0 2707; GFX10-NEXT: flat_load_dword v4, v[0:1] 2708; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start 2709; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2710; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2711; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 2712; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2713; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2714; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2715; GFX10-NEXT: buffer_gl1_inv 2716; GFX10-NEXT: buffer_gl0_inv 2717; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2718; GFX10-NEXT: v_mov_b32_e32 v4, v3 2719; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2720; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2721; GFX10-NEXT: s_cbranch_execnz .LBB13_1 2722; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2723; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2724; GFX10-NEXT: s_setpc_b64 s[30:31] 2725; 2726; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2727; GFX90A: ; %bb.0: 2728; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2729; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 2730; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 2731; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2732; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2733; GFX90A-NEXT: flat_load_dword v1, v[0:1] 2734; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2735; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start 2736; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2737; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2738; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 2739; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 2740; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2741; GFX90A-NEXT: buffer_wbinvl1 2742; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2743; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] 2744; GFX90A-NEXT: v_mov_b32_e32 v1, v0 2745; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2746; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 2747; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2748; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2749; GFX90A-NEXT: s_setpc_b64 s[30:31] 2750; 2751; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2752; GFX908: ; %bb.0: 2753; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2754; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 2755; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 2756; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2757; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2758; GFX908-NEXT: flat_load_dword v1, v[0:1] 2759; GFX908-NEXT: s_mov_b64 s[4:5], 0 2760; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start 2761; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2762; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2763; GFX908-NEXT: v_sub_f32_e32 v0, v1, v2 2764; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2765; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2766; GFX908-NEXT: buffer_wbinvl1 2767; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2768; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2769; GFX908-NEXT: v_mov_b32_e32 v1, v0 2770; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2771; GFX908-NEXT: s_cbranch_execnz .LBB13_1 2772; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2773; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2774; GFX908-NEXT: s_setpc_b64 s[30:31] 2775; 2776; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2777; GFX8: ; %bb.0: 2778; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2779; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 2780; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 2781; GFX8-NEXT: flat_load_dword v4, v[0:1] 2782; GFX8-NEXT: s_mov_b64 s[4:5], 0 2783; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start 2784; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2785; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2786; GFX8-NEXT: 
v_sub_f32_e32 v3, v4, v2 2787; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2788; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2789; GFX8-NEXT: buffer_wbinvl1 2790; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2791; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2792; GFX8-NEXT: v_mov_b32_e32 v4, v3 2793; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2794; GFX8-NEXT: s_cbranch_execnz .LBB13_1 2795; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2796; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2797; GFX8-NEXT: s_setpc_b64 s[30:31] 2798; 2799; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: 2800; GFX7: ; %bb.0: 2801; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2802; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 2803; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 2804; GFX7-NEXT: flat_load_dword v4, v[0:1] 2805; GFX7-NEXT: s_mov_b64 s[4:5], 0 2806; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start 2807; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2808; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2809; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 2810; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 2811; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2812; GFX7-NEXT: buffer_wbinvl1 2813; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2814; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2815; GFX7-NEXT: v_mov_b32_e32 v4, v3 2816; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2817; GFX7-NEXT: s_cbranch_execnz .LBB13_1 2818; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2819; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2820; GFX7-NEXT: s_setpc_b64 s[30:31] 2821 %gep = getelementptr float, ptr %ptr, i64 -512 2822 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst 2823 ret void 2824} 2825 2826define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 { 2827; GFX12-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2828; GFX12: ; %bb.0: 2829; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2830; GFX12-NEXT: s_wait_expcnt 0x0 2831; GFX12-NEXT: s_wait_samplecnt 0x0 2832; 
GFX12-NEXT: s_wait_bvhcnt 0x0 2833; GFX12-NEXT: s_wait_kmcnt 0x0 2834; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 2835; GFX12-NEXT: s_mov_b32 s0, 0 2836; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start 2837; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2838; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2839; GFX12-NEXT: v_mov_b32_e32 v4, v3 2840; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2841; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 2842; GFX12-NEXT: global_wb scope:SCOPE_SYS 2843; GFX12-NEXT: s_wait_storecnt 0x0 2844; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2845; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2846; GFX12-NEXT: global_inv scope:SCOPE_SYS 2847; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2848; GFX12-NEXT: s_wait_alu 0xfffe 2849; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 2850; GFX12-NEXT: s_wait_alu 0xfffe 2851; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2852; GFX12-NEXT: s_cbranch_execnz .LBB14_1 2853; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2854; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 2855; GFX12-NEXT: v_mov_b32_e32 v0, v3 2856; GFX12-NEXT: s_wait_alu 0xfffe 2857; GFX12-NEXT: s_setpc_b64 s[30:31] 2858; 2859; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2860; GFX940: ; %bb.0: 2861; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2862; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 2863; GFX940-NEXT: s_mov_b64 s[0:1], 0 2864; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start 2865; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2866; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2867; GFX940-NEXT: v_mov_b32_e32 v5, v3 2868; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 2869; GFX940-NEXT: buffer_wbl2 sc0 sc1 2870; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 2871; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2872; GFX940-NEXT: buffer_inv sc0 sc1 2873; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2874; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2875; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] 2876; GFX940-NEXT: s_cbranch_execnz .LBB14_1 2877; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2878; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2879; GFX940-NEXT: v_mov_b32_e32 v0, v3 2880; GFX940-NEXT: s_setpc_b64 s[30:31] 2881; 2882; GFX11-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2883; GFX11: ; %bb.0: 2884; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2885; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 2886; GFX11-NEXT: s_mov_b32 s0, 0 2887; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start 2888; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2889; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2890; GFX11-NEXT: v_mov_b32_e32 v4, v3 2891; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2892; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 2893; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2894; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 2895; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2896; GFX11-NEXT: buffer_gl1_inv 2897; GFX11-NEXT: buffer_gl0_inv 2898; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 2899; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 2900; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2901; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 2902; GFX11-NEXT: s_cbranch_execnz .LBB14_1 2903; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2904; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2905; GFX11-NEXT: v_mov_b32_e32 v0, v3 2906; GFX11-NEXT: s_setpc_b64 s[30:31] 2907; 2908; GFX10-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2909; GFX10: ; %bb.0: 2910; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2911; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 2912; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 2913; GFX10-NEXT: s_mov_b32 s4, 0 2914; GFX10-NEXT: flat_load_dword v0, v[3:4] 2915; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start 2916; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2917; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2918; GFX10-NEXT: v_mov_b32_e32 v1, v0 2919; GFX10-NEXT: v_sub_f32_e32 
v0, v1, v2 2920; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2921; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2922; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2923; GFX10-NEXT: buffer_gl1_inv 2924; GFX10-NEXT: buffer_gl0_inv 2925; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 2926; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2927; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2928; GFX10-NEXT: s_cbranch_execnz .LBB14_1 2929; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2930; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2931; GFX10-NEXT: s_setpc_b64 s[30:31] 2932; 2933; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2934; GFX90A: ; %bb.0: 2935; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2936; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 2937; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2938; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start 2939; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2940; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2941; GFX90A-NEXT: v_mov_b32_e32 v5, v3 2942; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 2943; GFX90A-NEXT: buffer_wbl2 2944; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 2945; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2946; GFX90A-NEXT: buffer_invl2 2947; GFX90A-NEXT: buffer_wbinvl1 2948; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2949; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2950; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2951; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 2952; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2953; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2954; GFX90A-NEXT: v_mov_b32_e32 v0, v3 2955; GFX90A-NEXT: s_setpc_b64 s[30:31] 2956; 2957; GFX908-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2958; GFX908: ; %bb.0: 2959; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2960; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 2961; GFX908-NEXT: s_mov_b64 s[4:5], 0 2962; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start 2963; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2964; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2965; GFX908-NEXT: v_mov_b32_e32 v4, v3 2966; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 2967; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 2968; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2969; GFX908-NEXT: buffer_wbinvl1 2970; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2971; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2972; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2973; GFX908-NEXT: s_cbranch_execnz .LBB14_1 2974; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2975; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2976; GFX908-NEXT: v_mov_b32_e32 v0, v3 2977; GFX908-NEXT: s_setpc_b64 s[30:31] 2978; 2979; GFX8-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 2980; GFX8: ; %bb.0: 2981; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2982; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 2983; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2984; GFX8-NEXT: flat_load_dword v0, v[3:4] 2985; GFX8-NEXT: s_mov_b64 s[4:5], 0 2986; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start 2987; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2988; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2989; GFX8-NEXT: v_mov_b32_e32 v1, v0 2990; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 2991; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2992; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2993; GFX8-NEXT: buffer_wbinvl1 2994; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2995; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2996; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2997; GFX8-NEXT: s_cbranch_execnz .LBB14_1 2998; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2999; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3000; GFX8-NEXT: s_setpc_b64 s[30:31] 3001; 3002; GFX7-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: 3003; GFX7: ; %bb.0: 3004; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3005; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0 3006; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3007; GFX7-NEXT: flat_load_dword v0, v[3:4] 3008; GFX7-NEXT: s_mov_b64 s[4:5], 0 
3009; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start 3010; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3011; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3012; GFX7-NEXT: v_mov_b32_e32 v1, v0 3013; GFX7-NEXT: v_sub_f32_e32 v0, v1, v2 3014; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 3015; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3016; GFX7-NEXT: buffer_wbinvl1 3017; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 3018; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3019; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3020; GFX7-NEXT: s_cbranch_execnz .LBB14_1 3021; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3022; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3023; GFX7-NEXT: s_setpc_b64 s[30:31] 3024 %gep = getelementptr float, ptr %ptr, i64 511 3025 %result = atomicrmw fsub ptr %gep, float %val seq_cst 3026 ret float %result 3027} 3028 3029define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 { 3030; GFX12-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3031; GFX12: ; %bb.0: 3032; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3033; GFX12-NEXT: s_wait_expcnt 0x0 3034; GFX12-NEXT: s_wait_samplecnt 0x0 3035; GFX12-NEXT: s_wait_bvhcnt 0x0 3036; GFX12-NEXT: s_wait_kmcnt 0x0 3037; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 3038; GFX12-NEXT: s_mov_b32 s0, 0 3039; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start 3040; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3041; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3042; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 3043; GFX12-NEXT: global_wb scope:SCOPE_SYS 3044; GFX12-NEXT: s_wait_storecnt 0x0 3045; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 3046; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3047; GFX12-NEXT: global_inv scope:SCOPE_SYS 3048; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 3049; GFX12-NEXT: v_mov_b32_e32 v4, v3 3050; GFX12-NEXT: s_wait_alu 0xfffe 3051; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 3052; GFX12-NEXT: s_wait_alu 0xfffe 3053; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 3054; GFX12-NEXT: s_cbranch_execnz .LBB15_1 3055; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 3056; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3057; GFX12-NEXT: s_wait_alu 0xfffe 3058; GFX12-NEXT: s_setpc_b64 s[30:31] 3059; 3060; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3061; GFX940: ; %bb.0: 3062; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3063; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 3064; GFX940-NEXT: s_mov_b64 s[0:1], 0 3065; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start 3066; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 3067; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3068; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 3069; GFX940-NEXT: buffer_wbl2 sc0 sc1 3070; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 3071; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3072; GFX940-NEXT: buffer_inv sc0 sc1 3073; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 3074; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 3075; GFX940-NEXT: v_mov_b32_e32 v5, v3 3076; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 3077; GFX940-NEXT: s_cbranch_execnz .LBB15_1 3078; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 3079; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3080; GFX940-NEXT: s_setpc_b64 s[30:31] 3081; 3082; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3083; GFX11: ; %bb.0: 3084; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3085; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 3086; GFX11-NEXT: s_mov_b32 s0, 0 3087; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start 3088; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3089; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3090; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 3091; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3092; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 3093; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3094; GFX11-NEXT: buffer_gl1_inv 3095; GFX11-NEXT: buffer_gl0_inv 3096; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 3097; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 3098; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 3099; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3100; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 3101; GFX11-NEXT: s_cbranch_execnz .LBB15_1 3102; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 3103; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3104; GFX11-NEXT: s_setpc_b64 s[30:31] 3105; 3106; GFX10-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3107; GFX10: ; %bb.0: 3108; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3109; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 3110; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 3111; GFX10-NEXT: s_mov_b32 s4, 0 3112; GFX10-NEXT: flat_load_dword v4, v[0:1] 3113; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start 3114; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3115; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3116; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 3117; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3118; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3119; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3120; GFX10-NEXT: buffer_gl1_inv 3121; GFX10-NEXT: buffer_gl0_inv 3122; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 3123; GFX10-NEXT: v_mov_b32_e32 v4, v3 3124; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 3125; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 3126; GFX10-NEXT: s_cbranch_execnz .LBB15_1 3127; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3128; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3129; GFX10-NEXT: s_setpc_b64 s[30:31] 3130; 3131; GFX90A-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3132; GFX90A: ; %bb.0: 3133; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3134; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 3135; GFX90A-NEXT: s_mov_b64 s[4:5], 0 3136; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start 3137; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3138; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3139; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 3140; GFX90A-NEXT: buffer_wbl2 3141; GFX90A-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[4:5] offset:2044 glc 3142; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3143; GFX90A-NEXT: buffer_invl2 3144; GFX90A-NEXT: buffer_wbinvl1 3145; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 3146; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3147; GFX90A-NEXT: v_mov_b32_e32 v5, v3 3148; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 3149; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 3150; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 3151; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3152; GFX90A-NEXT: s_setpc_b64 s[30:31] 3153; 3154; GFX908-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3155; GFX908: ; %bb.0: 3156; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3157; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 3158; GFX908-NEXT: s_mov_b64 s[4:5], 0 3159; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start 3160; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3161; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3162; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 3163; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 3164; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3165; GFX908-NEXT: buffer_wbinvl1 3166; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3167; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3168; GFX908-NEXT: v_mov_b32_e32 v4, v3 3169; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3170; GFX908-NEXT: s_cbranch_execnz .LBB15_1 3171; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3172; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3173; GFX908-NEXT: s_setpc_b64 s[30:31] 3174; 3175; GFX8-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3176; GFX8: ; %bb.0: 3177; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3178; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 3179; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3180; GFX8-NEXT: flat_load_dword v4, v[0:1] 3181; GFX8-NEXT: s_mov_b64 s[4:5], 0 3182; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start 3183; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3184; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3185; GFX8-NEXT: 
v_sub_f32_e32 v3, v4, v2 3186; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3187; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3188; GFX8-NEXT: buffer_wbinvl1 3189; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3190; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3191; GFX8-NEXT: v_mov_b32_e32 v4, v3 3192; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3193; GFX8-NEXT: s_cbranch_execnz .LBB15_1 3194; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3195; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3196; GFX8-NEXT: s_setpc_b64 s[30:31] 3197; 3198; GFX7-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: 3199; GFX7: ; %bb.0: 3200; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3201; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 3202; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3203; GFX7-NEXT: flat_load_dword v4, v[0:1] 3204; GFX7-NEXT: s_mov_b64 s[4:5], 0 3205; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start 3206; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3207; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3208; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 3209; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 3210; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3211; GFX7-NEXT: buffer_wbinvl1 3212; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 3213; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3214; GFX7-NEXT: v_mov_b32_e32 v4, v3 3215; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3216; GFX7-NEXT: s_cbranch_execnz .LBB15_1 3217; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3218; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3219; GFX7-NEXT: s_setpc_b64 s[30:31] 3220 %gep = getelementptr float, ptr %ptr, i64 511 3221 %unused = atomicrmw fsub ptr %gep, float %val seq_cst 3222 ret void 3223} 3224 3225; -------------------------------------------------------------------- 3226; double 3227; -------------------------------------------------------------------- 3228 3229define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { 3230; GFX12-LABEL: flat_agent_atomic_fsub_ret_f64: 3231; GFX12: ; %bb.0: 3232; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 3233; GFX12-NEXT: s_wait_expcnt 0x0 3234; GFX12-NEXT: s_wait_samplecnt 0x0 3235; GFX12-NEXT: s_wait_bvhcnt 0x0 3236; GFX12-NEXT: s_wait_kmcnt 0x0 3237; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 3238; GFX12-NEXT: s_mov_b32 s0, exec_lo 3239; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 3240; GFX12-NEXT: s_wait_alu 0xfffe 3241; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 3242; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 3243; GFX12-NEXT: s_cbranch_execz .LBB16_4 3244; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global 3245; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] 3246; GFX12-NEXT: s_mov_b32 s1, 0 3247; GFX12-NEXT: .LBB16_2: ; %atomicrmw.start 3248; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3249; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3250; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 3251; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3252; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] 3253; GFX12-NEXT: s_wait_storecnt 0x0 3254; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3255; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3256; GFX12-NEXT: global_inv scope:SCOPE_DEV 3257; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 3258; GFX12-NEXT: s_wait_alu 0xfffe 3259; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 3260; GFX12-NEXT: s_wait_alu 0xfffe 3261; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3262; GFX12-NEXT: s_cbranch_execnz .LBB16_2 3263; GFX12-NEXT: ; %bb.3: ; %Flow 3264; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 3265; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 3266; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 3267; GFX12-NEXT: .LBB16_4: ; %Flow3 3268; GFX12-NEXT: s_wait_alu 0xfffe 3269; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3270; GFX12-NEXT: s_cbranch_execz .LBB16_6 3271; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private 3272; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 3273; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 3274; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off 3275; GFX12-NEXT: s_wait_loadcnt 
0x0 3276; GFX12-NEXT: v_add_f64_e64 v[0:1], v[4:5], -v[2:3] 3277; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off 3278; GFX12-NEXT: .LBB16_6: ; %atomicrmw.phi 3279; GFX12-NEXT: s_wait_alu 0xfffe 3280; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3281; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 3282; GFX12-NEXT: s_wait_alu 0xfffe 3283; GFX12-NEXT: s_setpc_b64 s[30:31] 3284; 3285; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: 3286; GFX940: ; %bb.0: 3287; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3288; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 3289; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 3290; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 3291; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 3292; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3293; GFX940-NEXT: s_cbranch_execz .LBB16_4 3294; GFX940-NEXT: ; %bb.1: ; %atomicrmw.global 3295; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 3296; GFX940-NEXT: s_mov_b64 s[2:3], 0 3297; GFX940-NEXT: .LBB16_2: ; %atomicrmw.start 3298; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 3299; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3300; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] 3301; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3302; GFX940-NEXT: buffer_wbl2 sc1 3303; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 3304; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3305; GFX940-NEXT: buffer_inv sc1 3306; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3307; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 3308; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 3309; GFX940-NEXT: s_cbranch_execnz .LBB16_2 3310; GFX940-NEXT: ; %bb.3: ; %Flow 3311; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 3312; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 3313; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 3314; GFX940-NEXT: .LBB16_4: ; %Flow3 3315; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3316; GFX940-NEXT: s_cbranch_execz .LBB16_6 3317; GFX940-NEXT: ; %bb.5: ; %atomicrmw.private 3318; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[0:1] 3319; GFX940-NEXT: s_nop 1 3320; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 3321; GFX940-NEXT: scratch_load_dwordx2 v[4:5], v6, off 3322; GFX940-NEXT: s_waitcnt vmcnt(0) 3323; GFX940-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3324; GFX940-NEXT: scratch_store_dwordx2 v6, v[0:1], off sc0 sc1 3325; GFX940-NEXT: .LBB16_6: ; %atomicrmw.phi 3326; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3327; GFX940-NEXT: v_mov_b32_e32 v0, v4 3328; GFX940-NEXT: v_mov_b32_e32 v1, v5 3329; GFX940-NEXT: s_waitcnt vmcnt(0) 3330; GFX940-NEXT: s_setpc_b64 s[30:31] 3331; 3332; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64: 3333; GFX11: ; %bb.0: 3334; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3335; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 3336; GFX11-NEXT: s_mov_b32 s0, exec_lo 3337; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 3338; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 3339; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 3340; GFX11-NEXT: s_cbranch_execz .LBB16_4 3341; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global 3342; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] 3343; GFX11-NEXT: s_mov_b32 s1, 0 3344; GFX11-NEXT: .LBB16_2: ; %atomicrmw.start 3345; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3346; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3347; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 3348; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3349; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3350; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3351; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 3352; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3353; GFX11-NEXT: buffer_gl1_inv 3354; GFX11-NEXT: buffer_gl0_inv 3355; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 3356; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 3357; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3358; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3359; GFX11-NEXT: s_cbranch_execnz .LBB16_2 3360; GFX11-NEXT: ; %bb.3: ; %Flow 3361; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 3362; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3363; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 3364; GFX11-NEXT: .LBB16_4: ; %Flow3 3365; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3366; GFX11-NEXT: s_cbranch_execz .LBB16_6 3367; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private 3368; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 3369; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 3370; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off 3371; GFX11-NEXT: s_waitcnt vmcnt(0) 3372; GFX11-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3373; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off 3374; GFX11-NEXT: .LBB16_6: ; %atomicrmw.phi 3375; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3376; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 3377; GFX11-NEXT: s_setpc_b64 s[30:31] 3378; 3379; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64: 3380; GFX10: ; %bb.0: 3381; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3382; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 3383; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 3384; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 3385; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 3386; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 3387; GFX10-NEXT: s_cbranch_execz .LBB16_4 3388; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global 3389; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 3390; GFX10-NEXT: s_mov_b32 s5, 0 3391; GFX10-NEXT: .LBB16_2: ; %atomicrmw.start 3392; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3393; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3394; GFX10-NEXT: v_mov_b32_e32 v7, v5 3395; GFX10-NEXT: v_mov_b32_e32 v6, v4 3396; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3397; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3398; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 3399; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3400; GFX10-NEXT: buffer_gl1_inv 3401; GFX10-NEXT: buffer_gl0_inv 3402; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 3403; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 3404; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 3405; GFX10-NEXT: s_cbranch_execnz .LBB16_2 3406; 
GFX10-NEXT: ; %bb.3: ; %Flow 3407; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 3408; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 3409; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 3410; GFX10-NEXT: .LBB16_4: ; %Flow3 3411; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 3412; GFX10-NEXT: s_cbranch_execz .LBB16_6 3413; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private 3414; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 3415; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 3416; GFX10-NEXT: s_clause 0x1 3417; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 3418; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 3419; GFX10-NEXT: s_waitcnt vmcnt(0) 3420; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3421; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3422; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 3423; GFX10-NEXT: .LBB16_6: ; %atomicrmw.phi 3424; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3425; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3426; GFX10-NEXT: v_mov_b32_e32 v0, v4 3427; GFX10-NEXT: v_mov_b32_e32 v1, v5 3428; GFX10-NEXT: s_setpc_b64 s[30:31] 3429; 3430; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64: 3431; GFX90A: ; %bb.0: 3432; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3433; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 3434; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3435; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 3436; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3437; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3438; GFX90A-NEXT: s_cbranch_execz .LBB16_4 3439; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global 3440; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 3441; GFX90A-NEXT: s_mov_b64 s[6:7], 0 3442; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start 3443; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3444; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3445; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] 3446; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3447; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 
3448; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3449; GFX90A-NEXT: buffer_wbinvl1 3450; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3451; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3452; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 3453; GFX90A-NEXT: s_cbranch_execnz .LBB16_2 3454; GFX90A-NEXT: ; %bb.3: ; %Flow 3455; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 3456; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3457; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 3458; GFX90A-NEXT: .LBB16_4: ; %Flow3 3459; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3460; GFX90A-NEXT: s_cbranch_execz .LBB16_6 3461; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private 3462; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3463; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 3464; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 3465; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 3466; GFX90A-NEXT: s_waitcnt vmcnt(0) 3467; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3468; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3469; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 3470; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi 3471; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3472; GFX90A-NEXT: v_mov_b32_e32 v0, v4 3473; GFX90A-NEXT: v_mov_b32_e32 v1, v5 3474; GFX90A-NEXT: s_waitcnt vmcnt(0) 3475; GFX90A-NEXT: s_setpc_b64 s[30:31] 3476; 3477; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64: 3478; GFX908: ; %bb.0: 3479; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3480; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 3481; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3482; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 3483; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 3484; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3485; GFX908-NEXT: s_cbranch_execz .LBB16_4 3486; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global 3487; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 3488; GFX908-NEXT: s_mov_b64 s[6:7], 0 3489; GFX908-NEXT: .LBB16_2: ; %atomicrmw.start 3490; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 3491; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3492; GFX908-NEXT: v_mov_b32_e32 v7, v5 3493; GFX908-NEXT: v_mov_b32_e32 v6, v4 3494; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3495; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 3496; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3497; GFX908-NEXT: buffer_wbinvl1 3498; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3499; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3500; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 3501; GFX908-NEXT: s_cbranch_execnz .LBB16_2 3502; GFX908-NEXT: ; %bb.3: ; %Flow 3503; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3504; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3505; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 3506; GFX908-NEXT: .LBB16_4: ; %Flow3 3507; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3508; GFX908-NEXT: s_cbranch_execz .LBB16_6 3509; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private 3510; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3511; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 3512; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 3513; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 3514; GFX908-NEXT: s_waitcnt vmcnt(0) 3515; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3516; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3517; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 3518; GFX908-NEXT: .LBB16_6: ; %atomicrmw.phi 3519; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3520; GFX908-NEXT: v_mov_b32_e32 v0, v4 3521; GFX908-NEXT: v_mov_b32_e32 v1, v5 3522; GFX908-NEXT: s_waitcnt vmcnt(0) 3523; GFX908-NEXT: s_setpc_b64 s[30:31] 3524; 3525; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64: 3526; GFX8: ; %bb.0: 3527; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3528; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 3529; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 3530; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 3531; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3532; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 3533; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3534; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3535; GFX8-NEXT: s_cbranch_execz .LBB16_4 3536; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global 3537; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 3538; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3539; GFX8-NEXT: flat_load_dword v5, v[4:5] 3540; GFX8-NEXT: flat_load_dword v4, v[0:1] 3541; GFX8-NEXT: s_mov_b64 s[6:7], 0 3542; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start 3543; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3544; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3545; GFX8-NEXT: v_mov_b32_e32 v7, v5 3546; GFX8-NEXT: v_mov_b32_e32 v6, v4 3547; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3548; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 3549; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3550; GFX8-NEXT: buffer_wbinvl1 3551; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3552; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3553; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 3554; GFX8-NEXT: s_cbranch_execnz .LBB16_2 3555; GFX8-NEXT: ; %bb.3: ; %Flow 3556; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 3557; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3558; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 3559; GFX8-NEXT: .LBB16_4: ; %Flow3 3560; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3561; GFX8-NEXT: s_cbranch_execz .LBB16_6 3562; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private 3563; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3564; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 3565; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 3566; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 3567; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen 3568; GFX8-NEXT: s_waitcnt vmcnt(0) 3569; GFX8-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3570; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3571; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 3572; GFX8-NEXT: .LBB16_6: ; %atomicrmw.phi 3573; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3574; GFX8-NEXT: v_mov_b32_e32 v0, v4 3575; GFX8-NEXT: v_mov_b32_e32 v1, v5 
3576; GFX8-NEXT: s_waitcnt vmcnt(0) 3577; GFX8-NEXT: s_setpc_b64 s[30:31] 3578; 3579; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64: 3580; GFX7: ; %bb.0: 3581; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3582; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 3583; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 3584; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 3585; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3586; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 3587; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 3588; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3589; GFX7-NEXT: s_cbranch_execz .LBB16_4 3590; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global 3591; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 3592; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3593; GFX7-NEXT: flat_load_dword v5, v[4:5] 3594; GFX7-NEXT: flat_load_dword v4, v[0:1] 3595; GFX7-NEXT: s_mov_b64 s[6:7], 0 3596; GFX7-NEXT: .LBB16_2: ; %atomicrmw.start 3597; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3598; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3599; GFX7-NEXT: v_mov_b32_e32 v7, v5 3600; GFX7-NEXT: v_mov_b32_e32 v6, v4 3601; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 3602; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 3603; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3604; GFX7-NEXT: buffer_wbinvl1 3605; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 3606; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3607; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 3608; GFX7-NEXT: s_cbranch_execnz .LBB16_2 3609; GFX7-NEXT: ; %bb.3: ; %Flow 3610; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 3611; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 3612; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 3613; GFX7-NEXT: .LBB16_4: ; %Flow3 3614; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3615; GFX7-NEXT: s_cbranch_execz .LBB16_6 3616; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private 3617; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3618; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 3619; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 3620; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen 
3621; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen 3622; GFX7-NEXT: s_waitcnt vmcnt(0) 3623; GFX7-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] 3624; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3625; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 3626; GFX7-NEXT: .LBB16_6: ; %atomicrmw.phi 3627; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3628; GFX7-NEXT: v_mov_b32_e32 v0, v4 3629; GFX7-NEXT: v_mov_b32_e32 v1, v5 3630; GFX7-NEXT: s_waitcnt vmcnt(0) 3631; GFX7-NEXT: s_setpc_b64 s[30:31] 3632 %result = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst 3633 ret double %result 3634} 3635 3636define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %val) #0 { 3637; GFX12-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3638; GFX12: ; %bb.0: 3639; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3640; GFX12-NEXT: s_wait_expcnt 0x0 3641; GFX12-NEXT: s_wait_samplecnt 0x0 3642; GFX12-NEXT: s_wait_bvhcnt 0x0 3643; GFX12-NEXT: s_wait_kmcnt 0x0 3644; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 3645; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 3646; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 3647; GFX12-NEXT: s_mov_b32 s0, exec_lo 3648; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 3649; GFX12-NEXT: s_wait_alu 0xfffe 3650; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3651; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 3652; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 3653; GFX12-NEXT: s_cbranch_execnz .LBB17_3 3654; GFX12-NEXT: ; %bb.1: ; %Flow3 3655; GFX12-NEXT: s_wait_alu 0xfffe 3656; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3657; GFX12-NEXT: s_cbranch_execnz .LBB17_6 3658; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi 3659; GFX12-NEXT: s_wait_alu 0xfffe 3660; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3661; GFX12-NEXT: s_wait_alu 0xfffe 3662; GFX12-NEXT: s_setpc_b64 s[30:31] 3663; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global 3664; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] 3665; GFX12-NEXT: s_mov_b32 s1, 0 3666; GFX12-NEXT: .LBB17_4: ; 
%atomicrmw.start 3667; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3668; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3669; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 3670; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3671; GFX12-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[2:3] 3672; GFX12-NEXT: s_wait_storecnt 0x0 3673; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3674; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3675; GFX12-NEXT: global_inv scope:SCOPE_DEV 3676; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3677; GFX12-NEXT: s_wait_alu 0xfffe 3678; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 3679; GFX12-NEXT: s_wait_alu 0xfffe 3680; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3681; GFX12-NEXT: s_cbranch_execnz .LBB17_4 3682; GFX12-NEXT: ; %bb.5: ; %Flow 3683; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 3684; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 3685; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 3686; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3687; GFX12-NEXT: s_cbranch_execz .LBB17_2 3688; GFX12-NEXT: .LBB17_6: ; %atomicrmw.private 3689; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3690; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 3691; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 3692; GFX12-NEXT: s_wait_loadcnt 0x0 3693; GFX12-NEXT: v_add_f64_e64 v[2:3], v[0:1], -v[2:3] 3694; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off 3695; GFX12-NEXT: s_wait_alu 0xfffe 3696; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3697; GFX12-NEXT: s_wait_alu 0xfffe 3698; GFX12-NEXT: s_setpc_b64 s[30:31] 3699; 3700; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3701; GFX940: ; %bb.0: 3702; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3703; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 3704; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 3705; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 3706; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 3707; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 3708; GFX940-NEXT: 
s_and_saveexec_b64 s[0:1], vcc 3709; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3710; GFX940-NEXT: s_cbranch_execnz .LBB17_3 3711; GFX940-NEXT: ; %bb.1: ; %Flow3 3712; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3713; GFX940-NEXT: s_cbranch_execnz .LBB17_6 3714; GFX940-NEXT: .LBB17_2: ; %atomicrmw.phi 3715; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3716; GFX940-NEXT: s_setpc_b64 s[30:31] 3717; GFX940-NEXT: .LBB17_3: ; %atomicrmw.global 3718; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3719; GFX940-NEXT: s_mov_b64 s[2:3], 0 3720; GFX940-NEXT: .LBB17_4: ; %atomicrmw.start 3721; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 3722; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3723; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] 3724; GFX940-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 3725; GFX940-NEXT: buffer_wbl2 sc1 3726; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 3727; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3728; GFX940-NEXT: buffer_inv sc1 3729; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3730; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 3731; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 3732; GFX940-NEXT: s_cbranch_execnz .LBB17_4 3733; GFX940-NEXT: ; %bb.5: ; %Flow 3734; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 3735; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 3736; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 3737; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3738; GFX940-NEXT: s_cbranch_execz .LBB17_2 3739; GFX940-NEXT: .LBB17_6: ; %atomicrmw.private 3740; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3741; GFX940-NEXT: s_nop 1 3742; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 3743; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 3744; GFX940-NEXT: s_waitcnt vmcnt(0) 3745; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 3746; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 3747; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3748; GFX940-NEXT: s_waitcnt vmcnt(0) 3749; GFX940-NEXT: s_setpc_b64 s[30:31] 3750; 3751; GFX11-LABEL: 
flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3752; GFX11: ; %bb.0: 3753; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3754; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 3755; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 3756; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 3757; GFX11-NEXT: s_mov_b32 s0, exec_lo 3758; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 3759; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3760; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 3761; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 3762; GFX11-NEXT: s_cbranch_execnz .LBB17_3 3763; GFX11-NEXT: ; %bb.1: ; %Flow3 3764; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3765; GFX11-NEXT: s_cbranch_execnz .LBB17_6 3766; GFX11-NEXT: .LBB17_2: ; %atomicrmw.phi 3767; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3768; GFX11-NEXT: s_setpc_b64 s[30:31] 3769; GFX11-NEXT: .LBB17_3: ; %atomicrmw.global 3770; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] 3771; GFX11-NEXT: s_mov_b32 s1, 0 3772; GFX11-NEXT: .LBB17_4: ; %atomicrmw.start 3773; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3774; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3775; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 3776; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3777; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 3778; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3779; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc 3780; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3781; GFX11-NEXT: buffer_gl1_inv 3782; GFX11-NEXT: buffer_gl0_inv 3783; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3784; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 3785; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3786; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3787; GFX11-NEXT: s_cbranch_execnz .LBB17_4 3788; GFX11-NEXT: ; %bb.5: ; %Flow 3789; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 3790; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 3791; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 3792; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3793; GFX11-NEXT: 
s_cbranch_execz .LBB17_2 3794; GFX11-NEXT: .LBB17_6: ; %atomicrmw.private 3795; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3796; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 3797; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 3798; GFX11-NEXT: s_waitcnt vmcnt(0) 3799; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 3800; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off 3801; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3802; GFX11-NEXT: s_setpc_b64 s[30:31] 3803; 3804; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3805; GFX10: ; %bb.0: 3806; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3807; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 3808; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 3809; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 3810; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 3811; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 3812; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 3813; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 3814; GFX10-NEXT: s_cbranch_execnz .LBB17_3 3815; GFX10-NEXT: ; %bb.1: ; %Flow3 3816; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 3817; GFX10-NEXT: s_cbranch_execnz .LBB17_6 3818; GFX10-NEXT: .LBB17_2: ; %atomicrmw.phi 3819; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3820; GFX10-NEXT: s_setpc_b64 s[30:31] 3821; GFX10-NEXT: .LBB17_3: ; %atomicrmw.global 3822; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3823; GFX10-NEXT: s_mov_b32 s5, 0 3824; GFX10-NEXT: .LBB17_4: ; %atomicrmw.start 3825; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3826; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3827; GFX10-NEXT: v_mov_b32_e32 v9, v1 3828; GFX10-NEXT: v_mov_b32_e32 v8, v0 3829; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 3830; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3831; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3832; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3833; GFX10-NEXT: buffer_gl1_inv 3834; GFX10-NEXT: buffer_gl0_inv 3835; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3836; GFX10-NEXT: s_or_b32 
s5, vcc_lo, s5 3837; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 3838; GFX10-NEXT: s_cbranch_execnz .LBB17_4 3839; GFX10-NEXT: ; %bb.5: ; %Flow 3840; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 3841; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 3842; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 3843; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 3844; GFX10-NEXT: s_cbranch_execz .LBB17_2 3845; GFX10-NEXT: .LBB17_6: ; %atomicrmw.private 3846; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3847; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 3848; GFX10-NEXT: s_clause 0x1 3849; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 3850; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 3851; GFX10-NEXT: s_waitcnt vmcnt(0) 3852; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 3853; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 3854; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 3855; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3856; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3857; GFX10-NEXT: s_setpc_b64 s[30:31] 3858; 3859; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3860; GFX90A: ; %bb.0: 3861; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3862; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 3863; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 3864; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 3865; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3866; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3867; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3868; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3869; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 3870; GFX90A-NEXT: ; %bb.1: ; %Flow3 3871; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3872; GFX90A-NEXT: s_cbranch_execnz .LBB17_6 3873; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.phi 3874; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3875; GFX90A-NEXT: s_setpc_b64 s[30:31] 3876; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global 3877; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3878; GFX90A-NEXT: 
s_mov_b64 s[6:7], 0 3879; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start 3880; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3881; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3882; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] 3883; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 3884; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3885; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3886; GFX90A-NEXT: buffer_wbinvl1 3887; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3888; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3889; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 3890; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 3891; GFX90A-NEXT: ; %bb.5: ; %Flow 3892; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 3893; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 3894; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 3895; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3896; GFX90A-NEXT: s_cbranch_execz .LBB17_2 3897; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private 3898; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3899; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 3900; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 3901; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 3902; GFX90A-NEXT: s_waitcnt vmcnt(0) 3903; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 3904; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 3905; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 3906; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3907; GFX90A-NEXT: s_waitcnt vmcnt(0) 3908; GFX90A-NEXT: s_setpc_b64 s[30:31] 3909; 3910; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3911; GFX908: ; %bb.0: 3912; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3913; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 3914; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 3915; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 3916; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3917; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3918; GFX908-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 3919; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3920; GFX908-NEXT: s_cbranch_execnz .LBB17_3 3921; GFX908-NEXT: ; %bb.1: ; %Flow3 3922; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3923; GFX908-NEXT: s_cbranch_execnz .LBB17_6 3924; GFX908-NEXT: .LBB17_2: ; %atomicrmw.phi 3925; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3926; GFX908-NEXT: s_setpc_b64 s[30:31] 3927; GFX908-NEXT: .LBB17_3: ; %atomicrmw.global 3928; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3929; GFX908-NEXT: s_mov_b64 s[6:7], 0 3930; GFX908-NEXT: .LBB17_4: ; %atomicrmw.start 3931; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3932; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3933; GFX908-NEXT: v_mov_b32_e32 v9, v1 3934; GFX908-NEXT: v_mov_b32_e32 v8, v0 3935; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 3936; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3937; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3938; GFX908-NEXT: buffer_wbinvl1 3939; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3940; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3941; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 3942; GFX908-NEXT: s_cbranch_execnz .LBB17_4 3943; GFX908-NEXT: ; %bb.5: ; %Flow 3944; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3945; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 3946; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 3947; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3948; GFX908-NEXT: s_cbranch_execz .LBB17_2 3949; GFX908-NEXT: .LBB17_6: ; %atomicrmw.private 3950; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3951; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 3952; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 3953; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 3954; GFX908-NEXT: s_waitcnt vmcnt(0) 3955; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 3956; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 3957; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 3958; GFX908-NEXT: s_or_b64 exec, exec, 
s[4:5] 3959; GFX908-NEXT: s_waitcnt vmcnt(0) 3960; GFX908-NEXT: s_setpc_b64 s[30:31] 3961; 3962; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 3963; GFX8: ; %bb.0: 3964; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3965; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 3966; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 3967; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 3968; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3969; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3970; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 3971; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3972; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3973; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3974; GFX8-NEXT: s_cbranch_execnz .LBB17_3 3975; GFX8-NEXT: ; %bb.1: ; %Flow3 3976; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3977; GFX8-NEXT: s_cbranch_execnz .LBB17_6 3978; GFX8-NEXT: .LBB17_2: ; %atomicrmw.phi 3979; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3980; GFX8-NEXT: s_setpc_b64 s[30:31] 3981; GFX8-NEXT: .LBB17_3: ; %atomicrmw.global 3982; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 3983; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 3984; GFX8-NEXT: flat_load_dword v1, v[0:1] 3985; GFX8-NEXT: flat_load_dword v0, v[4:5] 3986; GFX8-NEXT: s_mov_b64 s[6:7], 0 3987; GFX8-NEXT: .LBB17_4: ; %atomicrmw.start 3988; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3989; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3990; GFX8-NEXT: v_mov_b32_e32 v9, v1 3991; GFX8-NEXT: v_mov_b32_e32 v8, v0 3992; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 3993; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3994; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3995; GFX8-NEXT: buffer_wbinvl1 3996; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3997; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3998; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 3999; GFX8-NEXT: s_cbranch_execnz .LBB17_4 4000; GFX8-NEXT: ; %bb.5: ; %Flow 4001; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 4002; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 4003; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 
4004; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4005; GFX8-NEXT: s_cbranch_execz .LBB17_2 4006; GFX8-NEXT: .LBB17_6: ; %atomicrmw.private 4007; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 4008; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4009; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 4010; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4011; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 4012; GFX8-NEXT: s_waitcnt vmcnt(0) 4013; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4014; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 4015; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 4016; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4017; GFX8-NEXT: s_waitcnt vmcnt(0) 4018; GFX8-NEXT: s_setpc_b64 s[30:31] 4019; 4020; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: 4021; GFX7: ; %bb.0: 4022; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4023; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 4024; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 4025; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 4026; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 4027; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4028; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 4029; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 4030; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 4031; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4032; GFX7-NEXT: s_cbranch_execnz .LBB17_3 4033; GFX7-NEXT: ; %bb.1: ; %Flow3 4034; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4035; GFX7-NEXT: s_cbranch_execnz .LBB17_6 4036; GFX7-NEXT: .LBB17_2: ; %atomicrmw.phi 4037; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4038; GFX7-NEXT: s_setpc_b64 s[30:31] 4039; GFX7-NEXT: .LBB17_3: ; %atomicrmw.global 4040; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 4041; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 4042; GFX7-NEXT: flat_load_dword v1, v[0:1] 4043; GFX7-NEXT: flat_load_dword v0, v[4:5] 4044; GFX7-NEXT: s_mov_b64 s[6:7], 0 4045; GFX7-NEXT: .LBB17_4: ; %atomicrmw.start 4046; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4047; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 4048; GFX7-NEXT: v_mov_b32_e32 v9, v1 4049; GFX7-NEXT: v_mov_b32_e32 v8, v0 4050; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4051; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4052; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4053; GFX7-NEXT: buffer_wbinvl1 4054; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4055; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4056; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 4057; GFX7-NEXT: s_cbranch_execnz .LBB17_4 4058; GFX7-NEXT: ; %bb.5: ; %Flow 4059; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 4060; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 4061; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 4062; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4063; GFX7-NEXT: s_cbranch_execz .LBB17_2 4064; GFX7-NEXT: .LBB17_6: ; %atomicrmw.private 4065; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 4066; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4067; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 4068; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4069; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 4070; GFX7-NEXT: s_waitcnt vmcnt(0) 4071; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4072; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 4073; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 4074; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4075; GFX7-NEXT: s_waitcnt vmcnt(0) 4076; GFX7-NEXT: s_setpc_b64 s[30:31] 4077 %gep = getelementptr double, ptr %ptr, i64 255 4078 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst 4079 ret double %result 4080} 4081 4082define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %val) #0 { 4083; GFX12-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4084; GFX12: ; %bb.0: 4085; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4086; GFX12-NEXT: s_wait_expcnt 0x0 4087; GFX12-NEXT: s_wait_samplecnt 0x0 4088; GFX12-NEXT: s_wait_bvhcnt 0x0 4089; GFX12-NEXT: s_wait_kmcnt 0x0 4090; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 4091; GFX12-NEXT: 
v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 4092; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 4093; GFX12-NEXT: s_mov_b32 s0, exec_lo 4094; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 4095; GFX12-NEXT: s_wait_alu 0xfffe 4096; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4097; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 4098; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 4099; GFX12-NEXT: s_cbranch_execnz .LBB18_3 4100; GFX12-NEXT: ; %bb.1: ; %Flow3 4101; GFX12-NEXT: s_wait_alu 0xfffe 4102; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4103; GFX12-NEXT: s_cbranch_execnz .LBB18_6 4104; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi 4105; GFX12-NEXT: s_wait_alu 0xfffe 4106; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4107; GFX12-NEXT: s_wait_alu 0xfffe 4108; GFX12-NEXT: s_setpc_b64 s[30:31] 4109; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global 4110; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] 4111; GFX12-NEXT: s_mov_b32 s1, 0 4112; GFX12-NEXT: .LBB18_4: ; %atomicrmw.start 4113; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 4114; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4115; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 4116; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4117; GFX12-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[2:3] 4118; GFX12-NEXT: s_wait_storecnt 0x0 4119; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4120; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4121; GFX12-NEXT: global_inv scope:SCOPE_DEV 4122; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 4123; GFX12-NEXT: s_wait_alu 0xfffe 4124; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4125; GFX12-NEXT: s_wait_alu 0xfffe 4126; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4127; GFX12-NEXT: s_cbranch_execnz .LBB18_4 4128; GFX12-NEXT: ; %bb.5: ; %Flow 4129; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 4130; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 4131; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 4132; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4133; GFX12-NEXT: s_cbranch_execz .LBB18_2 4134; 
GFX12-NEXT: .LBB18_6: ; %atomicrmw.private 4135; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 4136; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 4137; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 4138; GFX12-NEXT: s_wait_loadcnt 0x0 4139; GFX12-NEXT: v_add_f64_e64 v[2:3], v[0:1], -v[2:3] 4140; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off 4141; GFX12-NEXT: s_wait_alu 0xfffe 4142; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4143; GFX12-NEXT: s_wait_alu 0xfffe 4144; GFX12-NEXT: s_setpc_b64 s[30:31] 4145; 4146; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4147; GFX940: ; %bb.0: 4148; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4149; GFX940-NEXT: s_movk_i32 s0, 0xf800 4150; GFX940-NEXT: s_mov_b32 s1, -1 4151; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 4152; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 4153; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 4154; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 4155; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 4156; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4157; GFX940-NEXT: s_cbranch_execnz .LBB18_3 4158; GFX940-NEXT: ; %bb.1: ; %Flow3 4159; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4160; GFX940-NEXT: s_cbranch_execnz .LBB18_6 4161; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi 4162; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4163; GFX940-NEXT: s_setpc_b64 s[30:31] 4164; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global 4165; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 4166; GFX940-NEXT: s_mov_b64 s[2:3], 0 4167; GFX940-NEXT: .LBB18_4: ; %atomicrmw.start 4168; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 4169; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4170; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] 4171; GFX940-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4172; GFX940-NEXT: buffer_wbl2 sc1 4173; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 4174; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4175; GFX940-NEXT: buffer_inv sc1 4176; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 
4177; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 4178; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 4179; GFX940-NEXT: s_cbranch_execnz .LBB18_4 4180; GFX940-NEXT: ; %bb.5: ; %Flow 4181; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 4182; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 4183; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 4184; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4185; GFX940-NEXT: s_cbranch_execz .LBB18_2 4186; GFX940-NEXT: .LBB18_6: ; %atomicrmw.private 4187; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 4188; GFX940-NEXT: s_nop 1 4189; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4190; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 4191; GFX940-NEXT: s_waitcnt vmcnt(0) 4192; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4193; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 4194; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4195; GFX940-NEXT: s_waitcnt vmcnt(0) 4196; GFX940-NEXT: s_setpc_b64 s[30:31] 4197; 4198; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4199; GFX11: ; %bb.0: 4200; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4201; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 4202; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 4203; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 4204; GFX11-NEXT: s_mov_b32 s0, exec_lo 4205; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 4206; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4207; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 4208; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 4209; GFX11-NEXT: s_cbranch_execnz .LBB18_3 4210; GFX11-NEXT: ; %bb.1: ; %Flow3 4211; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4212; GFX11-NEXT: s_cbranch_execnz .LBB18_6 4213; GFX11-NEXT: .LBB18_2: ; %atomicrmw.phi 4214; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4215; GFX11-NEXT: s_setpc_b64 s[30:31] 4216; GFX11-NEXT: .LBB18_3: ; %atomicrmw.global 4217; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] 4218; GFX11-NEXT: s_mov_b32 s1, 0 4219; GFX11-NEXT: .LBB18_4: ; %atomicrmw.start 4220; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 4221; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4222; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 4223; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4224; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4225; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4226; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc 4227; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4228; GFX11-NEXT: buffer_gl1_inv 4229; GFX11-NEXT: buffer_gl0_inv 4230; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 4231; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 4232; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4233; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4234; GFX11-NEXT: s_cbranch_execnz .LBB18_4 4235; GFX11-NEXT: ; %bb.5: ; %Flow 4236; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 4237; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 4238; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 4239; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4240; GFX11-NEXT: s_cbranch_execz .LBB18_2 4241; GFX11-NEXT: .LBB18_6: ; %atomicrmw.private 4242; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 4243; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 4244; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 4245; GFX11-NEXT: s_waitcnt vmcnt(0) 4246; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4247; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off 4248; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4249; GFX11-NEXT: s_setpc_b64 s[30:31] 4250; 4251; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4252; GFX10: ; %bb.0: 4253; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4254; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 4255; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 4256; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 4257; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 4258; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 4259; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 4260; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 4261; GFX10-NEXT: s_cbranch_execnz .LBB18_3 4262; 
GFX10-NEXT: ; %bb.1: ; %Flow3 4263; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4264; GFX10-NEXT: s_cbranch_execnz .LBB18_6 4265; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi 4266; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4267; GFX10-NEXT: s_setpc_b64 s[30:31] 4268; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global 4269; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 4270; GFX10-NEXT: s_mov_b32 s5, 0 4271; GFX10-NEXT: .LBB18_4: ; %atomicrmw.start 4272; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4273; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4274; GFX10-NEXT: v_mov_b32_e32 v9, v1 4275; GFX10-NEXT: v_mov_b32_e32 v8, v0 4276; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4277; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4278; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4279; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4280; GFX10-NEXT: buffer_gl1_inv 4281; GFX10-NEXT: buffer_gl0_inv 4282; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 4283; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 4284; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 4285; GFX10-NEXT: s_cbranch_execnz .LBB18_4 4286; GFX10-NEXT: ; %bb.5: ; %Flow 4287; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 4288; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 4289; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 4290; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4291; GFX10-NEXT: s_cbranch_execz .LBB18_2 4292; GFX10-NEXT: .LBB18_6: ; %atomicrmw.private 4293; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 4294; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 4295; GFX10-NEXT: s_clause 0x1 4296; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4297; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4298; GFX10-NEXT: s_waitcnt vmcnt(0) 4299; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4300; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 4301; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 4302; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4303; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4304; GFX10-NEXT: 
s_setpc_b64 s[30:31] 4305; 4306; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4307; GFX90A: ; %bb.0: 4308; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4309; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 4310; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 4311; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 4312; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 4313; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 4314; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 4315; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4316; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 4317; GFX90A-NEXT: ; %bb.1: ; %Flow3 4318; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4319; GFX90A-NEXT: s_cbranch_execnz .LBB18_6 4320; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi 4321; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4322; GFX90A-NEXT: s_setpc_b64 s[30:31] 4323; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global 4324; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 4325; GFX90A-NEXT: s_mov_b64 s[6:7], 0 4326; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start 4327; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4328; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4329; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] 4330; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4331; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4332; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4333; GFX90A-NEXT: buffer_wbinvl1 4334; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4335; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4336; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 4337; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 4338; GFX90A-NEXT: ; %bb.5: ; %Flow 4339; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 4340; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 4341; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 4342; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4343; GFX90A-NEXT: s_cbranch_execz .LBB18_2 4344; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private 4345; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[4:5] 4346; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4347; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4348; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4349; GFX90A-NEXT: s_waitcnt vmcnt(0) 4350; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4351; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 4352; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 4353; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4354; GFX90A-NEXT: s_waitcnt vmcnt(0) 4355; GFX90A-NEXT: s_setpc_b64 s[30:31] 4356; 4357; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4358; GFX908: ; %bb.0: 4359; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4360; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 4361; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 4362; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 4363; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 4364; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 4365; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 4366; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4367; GFX908-NEXT: s_cbranch_execnz .LBB18_3 4368; GFX908-NEXT: ; %bb.1: ; %Flow3 4369; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4370; GFX908-NEXT: s_cbranch_execnz .LBB18_6 4371; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi 4372; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4373; GFX908-NEXT: s_setpc_b64 s[30:31] 4374; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global 4375; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 4376; GFX908-NEXT: s_mov_b64 s[6:7], 0 4377; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start 4378; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4379; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4380; GFX908-NEXT: v_mov_b32_e32 v9, v1 4381; GFX908-NEXT: v_mov_b32_e32 v8, v0 4382; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4383; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4384; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4385; GFX908-NEXT: buffer_wbinvl1 4386; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[8:9] 4387; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4388; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 4389; GFX908-NEXT: s_cbranch_execnz .LBB18_4 4390; GFX908-NEXT: ; %bb.5: ; %Flow 4391; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 4392; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 4393; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 4394; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4395; GFX908-NEXT: s_cbranch_execz .LBB18_2 4396; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private 4397; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 4398; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4399; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4400; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4401; GFX908-NEXT: s_waitcnt vmcnt(0) 4402; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4403; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 4404; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 4405; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4406; GFX908-NEXT: s_waitcnt vmcnt(0) 4407; GFX908-NEXT: s_setpc_b64 s[30:31] 4408; 4409; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4410; GFX8: ; %bb.0: 4411; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4412; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 4413; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 4414; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 4415; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 4416; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4417; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 4418; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4419; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4420; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4421; GFX8-NEXT: s_cbranch_execnz .LBB18_3 4422; GFX8-NEXT: ; %bb.1: ; %Flow3 4423; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4424; GFX8-NEXT: s_cbranch_execnz .LBB18_6 4425; GFX8-NEXT: .LBB18_2: ; %atomicrmw.phi 4426; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4427; GFX8-NEXT: s_setpc_b64 s[30:31] 4428; GFX8-NEXT: .LBB18_3: ; %atomicrmw.global 4429; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 4430; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 4431; GFX8-NEXT: flat_load_dword v1, v[0:1] 4432; GFX8-NEXT: flat_load_dword v0, v[4:5] 4433; GFX8-NEXT: s_mov_b64 s[6:7], 0 4434; GFX8-NEXT: .LBB18_4: ; %atomicrmw.start 4435; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4436; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4437; GFX8-NEXT: v_mov_b32_e32 v9, v1 4438; GFX8-NEXT: v_mov_b32_e32 v8, v0 4439; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4440; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4441; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4442; GFX8-NEXT: buffer_wbinvl1 4443; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4444; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4445; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 4446; GFX8-NEXT: s_cbranch_execnz .LBB18_4 4447; GFX8-NEXT: ; %bb.5: ; %Flow 4448; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 4449; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 4450; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 4451; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4452; GFX8-NEXT: s_cbranch_execz .LBB18_2 4453; GFX8-NEXT: .LBB18_6: ; %atomicrmw.private 4454; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 4455; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4456; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 4457; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4458; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 4459; GFX8-NEXT: s_waitcnt vmcnt(0) 4460; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] 4461; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen 4462; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 4463; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4464; GFX8-NEXT: s_waitcnt vmcnt(0) 4465; GFX8-NEXT: s_setpc_b64 s[30:31] 4466; 4467; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: 4468; GFX7: ; %bb.0: 4469; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4470; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 4471; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 4472; GFX7-NEXT: v_add_i32_e32 
v4, vcc, 0xfffff800, v0 4473; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 4474; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4475; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 4476; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 4477; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 4478; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4479; GFX7-NEXT: s_cbranch_execnz .LBB18_3 4480; GFX7-NEXT: ; %bb.1: ; %Flow3 4481; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4482; GFX7-NEXT: s_cbranch_execnz .LBB18_6 4483; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi 4484; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4485; GFX7-NEXT: s_setpc_b64 s[30:31] 4486; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global 4487; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 4488; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 4489; GFX7-NEXT: flat_load_dword v1, v[0:1] 4490; GFX7-NEXT: flat_load_dword v0, v[4:5] 4491; GFX7-NEXT: s_mov_b64 s[6:7], 0 4492; GFX7-NEXT: .LBB18_4: ; %atomicrmw.start 4493; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4494; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4495; GFX7-NEXT: v_mov_b32_e32 v9, v1 4496; GFX7-NEXT: v_mov_b32_e32 v8, v0 4497; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] 4498; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 4499; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4500; GFX7-NEXT: buffer_wbinvl1 4501; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 4502; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4503; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 4504; GFX7-NEXT: s_cbranch_execnz .LBB18_4 4505; GFX7-NEXT: ; %bb.5: ; %Flow 4506; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 4507; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 4508; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 4509; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4510; GFX7-NEXT: s_cbranch_execz .LBB18_2 4511; GFX7-NEXT: .LBB18_6: ; %atomicrmw.private 4512; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 4513; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 4514; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 4515; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 
offen
; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %ptr, i64 -256
  %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
  ret double %result
}

; Agent-scope, seq_cst `atomicrmw fsub` of a double through %ptr with the
; result discarded (the function returns void). In the generated assembly
; below, every run first compares the pointer's high dword against
; src_private_base (the GFX7 and GFX8 runs instead compare against a dword
; fetched with s_load_dword; presumably the same aperture value, to be
; confirmed) and then takes one of two paths. %atomicrmw.global runs a
; flat_atomic_cmpswap retry loop, while %atomicrmw.private does a plain load,
; a v_add_f64 with a negated operand, and a store.
; The check lines are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_3
; GFX12-NEXT: ; %bb.1: ; %Flow3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_6
; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB19_4
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB19_2
; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB19_3
; GFX940-NEXT: ; %bb.1: ; %Flow3
; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB19_6
; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
; GFX940-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB19_4
; GFX940-NEXT: ; %bb.5: ; %Flow
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_cbranch_execz .LBB19_2
; GFX940-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB19_3
; GFX11-NEXT: ; %bb.1: ; %Flow3
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execnz .LBB19_6
; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB19_4
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB19_2
; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_3
; GFX10-NEXT: ; %bb.1: ; %Flow3
; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_6
; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB19_4
; GFX10-NEXT: ; %bb.5: ; %Flow
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
; GFX10-NEXT: s_cbranch_execz .LBB19_2
; GFX10-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
; GFX90A-NEXT: ; %bb.1: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_6
; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_4
; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_3
; GFX908-NEXT: ; %bb.1: ; %Flow3
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_6
; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB19_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB19_2
; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_3
; GFX8-NEXT: ; %bb.1: ; %Flow3
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_6
; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[4:5]
; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB19_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB19_2
; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_3
; GFX7-NEXT: ; %bb.1: ; %Flow3
; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_6
; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[4:5]
; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB19_4: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB19_4
; GFX7-NEXT: ; %bb.5: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_cbranch_execz .LBB19_2
; GFX7-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4
; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
  ; The atomicrmw result is never read; the function returns void.
  %unused = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst
  ret void
}

define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %val) #0 {
; GFX12-LABEL:
flat_agent_atomic_fsub_noret_f64__offset12b_pos: 4948; GFX12: ; %bb.0: 4949; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4950; GFX12-NEXT: s_wait_expcnt 0x0 4951; GFX12-NEXT: s_wait_samplecnt 0x0 4952; GFX12-NEXT: s_wait_bvhcnt 0x0 4953; GFX12-NEXT: s_wait_kmcnt 0x0 4954; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 4955; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 4956; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 4957; GFX12-NEXT: s_mov_b32 s0, exec_lo 4958; GFX12-NEXT: s_wait_alu 0xfffe 4959; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4960; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 4961; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 4962; GFX12-NEXT: s_cbranch_execnz .LBB20_3 4963; GFX12-NEXT: ; %bb.1: ; %Flow3 4964; GFX12-NEXT: s_wait_alu 0xfffe 4965; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4966; GFX12-NEXT: s_cbranch_execnz .LBB20_6 4967; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi 4968; GFX12-NEXT: s_wait_alu 0xfffe 4969; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4970; GFX12-NEXT: s_wait_alu 0xfffe 4971; GFX12-NEXT: s_setpc_b64 s[30:31] 4972; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global 4973; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] 4974; GFX12-NEXT: s_mov_b32 s1, 0 4975; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start 4976; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 4977; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4978; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] 4979; GFX12-NEXT: s_wait_storecnt 0x0 4980; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4981; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4982; GFX12-NEXT: global_inv scope:SCOPE_DEV 4983; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 4984; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 4985; GFX12-NEXT: s_wait_alu 0xfffe 4986; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4987; GFX12-NEXT: s_wait_alu 0xfffe 4988; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4989; GFX12-NEXT: s_cbranch_execnz .LBB20_4 4990; GFX12-NEXT: ; %bb.5: ; %Flow 4991; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 4992; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 4993; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 4994; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4995; GFX12-NEXT: s_cbranch_execz .LBB20_2 4996; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private 4997; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 4998; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 4999; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 5000; GFX12-NEXT: s_wait_loadcnt 0x0 5001; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] 5002; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off 5003; GFX12-NEXT: s_wait_alu 0xfffe 5004; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5005; GFX12-NEXT: s_wait_alu 0xfffe 5006; GFX12-NEXT: s_setpc_b64 s[30:31] 5007; 5008; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5009; GFX940: ; %bb.0: 5010; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5011; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 5012; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 5013; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 5014; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 5015; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 5016; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 5017; GFX940-NEXT: s_cbranch_execnz .LBB20_3 5018; GFX940-NEXT: ; %bb.1: ; %Flow3 5019; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5020; GFX940-NEXT: s_cbranch_execnz .LBB20_6 5021; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi 5022; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5023; GFX940-NEXT: s_setpc_b64 s[30:31] 5024; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global 5025; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5026; GFX940-NEXT: s_mov_b64 s[2:3], 0 5027; GFX940-NEXT: .LBB20_4: ; %atomicrmw.start 5028; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 5029; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5030; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5031; GFX940-NEXT: buffer_wbl2 sc1 5032; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 5033; GFX940-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 5034; GFX940-NEXT: buffer_inv sc1 5035; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5036; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 5037; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] 5038; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 5039; GFX940-NEXT: s_cbranch_execnz .LBB20_4 5040; GFX940-NEXT: ; %bb.5: ; %Flow 5041; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 5042; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 5043; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 5044; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5045; GFX940-NEXT: s_cbranch_execz .LBB20_2 5046; GFX940-NEXT: .LBB20_6: ; %atomicrmw.private 5047; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5048; GFX940-NEXT: s_nop 1 5049; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5050; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 5051; GFX940-NEXT: s_waitcnt vmcnt(0) 5052; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5053; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 5054; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5055; GFX940-NEXT: s_waitcnt vmcnt(0) 5056; GFX940-NEXT: s_setpc_b64 s[30:31] 5057; 5058; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5059; GFX11: ; %bb.0: 5060; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5061; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 5062; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 5063; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 5064; GFX11-NEXT: s_mov_b32 s0, exec_lo 5065; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5066; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 5067; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 5068; GFX11-NEXT: s_cbranch_execnz .LBB20_3 5069; GFX11-NEXT: ; %bb.1: ; %Flow3 5070; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5071; GFX11-NEXT: s_cbranch_execnz .LBB20_6 5072; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi 5073; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5074; GFX11-NEXT: s_setpc_b64 s[30:31] 5075; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global 5076; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] 5077; 
GFX11-NEXT: s_mov_b32 s1, 0 5078; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start 5079; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5080; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5081; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5082; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5083; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 5084; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5085; GFX11-NEXT: buffer_gl1_inv 5086; GFX11-NEXT: buffer_gl0_inv 5087; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5088; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 5089; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 5090; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5091; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5092; GFX11-NEXT: s_cbranch_execnz .LBB20_4 5093; GFX11-NEXT: ; %bb.5: ; %Flow 5094; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 5095; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 5096; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 5097; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5098; GFX11-NEXT: s_cbranch_execz .LBB20_2 5099; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private 5100; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5101; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 5102; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 5103; GFX11-NEXT: s_waitcnt vmcnt(0) 5104; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5105; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off 5106; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5107; GFX11-NEXT: s_setpc_b64 s[30:31] 5108; 5109; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5110; GFX10: ; %bb.0: 5111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5112; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 5113; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 5114; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 5115; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 5116; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 5117; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 5118; GFX10-NEXT: s_cbranch_execnz .LBB20_3 5119; 
GFX10-NEXT: ; %bb.1: ; %Flow3 5120; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5121; GFX10-NEXT: s_cbranch_execnz .LBB20_6 5122; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi 5123; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5124; GFX10-NEXT: s_setpc_b64 s[30:31] 5125; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global 5126; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5127; GFX10-NEXT: s_mov_b32 s5, 0 5128; GFX10-NEXT: .LBB20_4: ; %atomicrmw.start 5129; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5130; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5131; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5132; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5133; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5134; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5135; GFX10-NEXT: buffer_gl1_inv 5136; GFX10-NEXT: buffer_gl0_inv 5137; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5138; GFX10-NEXT: v_mov_b32_e32 v7, v5 5139; GFX10-NEXT: v_mov_b32_e32 v6, v4 5140; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5141; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5142; GFX10-NEXT: s_cbranch_execnz .LBB20_4 5143; GFX10-NEXT: ; %bb.5: ; %Flow 5144; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5145; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 5146; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 5147; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5148; GFX10-NEXT: s_cbranch_execz .LBB20_2 5149; GFX10-NEXT: .LBB20_6: ; %atomicrmw.private 5150; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5151; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 5152; GFX10-NEXT: s_clause 0x1 5153; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5154; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5155; GFX10-NEXT: s_waitcnt vmcnt(0) 5156; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5157; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5158; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5159; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5160; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5161; GFX10-NEXT: 
s_setpc_b64 s[30:31] 5162; 5163; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5164; GFX90A: ; %bb.0: 5165; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5166; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 5167; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 5168; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 5169; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5170; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5171; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5172; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 5173; GFX90A-NEXT: ; %bb.1: ; %Flow3 5174; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5175; GFX90A-NEXT: s_cbranch_execnz .LBB20_6 5176; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi 5177; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5178; GFX90A-NEXT: s_setpc_b64 s[30:31] 5179; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global 5180; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5181; GFX90A-NEXT: s_mov_b64 s[6:7], 0 5182; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.start 5183; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5184; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5185; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5186; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5187; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5188; GFX90A-NEXT: buffer_wbinvl1 5189; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5190; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5191; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] 5192; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 5193; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 5194; GFX90A-NEXT: ; %bb.5: ; %Flow 5195; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5196; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5197; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 5198; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5199; GFX90A-NEXT: s_cbranch_execz .LBB20_2 5200; GFX90A-NEXT: .LBB20_6: ; %atomicrmw.private 5201; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5202; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, 
vcc 5203; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5204; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5205; GFX90A-NEXT: s_waitcnt vmcnt(0) 5206; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5207; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5208; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5209; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5210; GFX90A-NEXT: s_waitcnt vmcnt(0) 5211; GFX90A-NEXT: s_setpc_b64 s[30:31] 5212; 5213; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5214; GFX908: ; %bb.0: 5215; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5216; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 5217; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 5218; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 5219; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5220; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5221; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5222; GFX908-NEXT: s_cbranch_execnz .LBB20_3 5223; GFX908-NEXT: ; %bb.1: ; %Flow3 5224; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5225; GFX908-NEXT: s_cbranch_execnz .LBB20_6 5226; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi 5227; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5228; GFX908-NEXT: s_setpc_b64 s[30:31] 5229; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global 5230; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5231; GFX908-NEXT: s_mov_b64 s[6:7], 0 5232; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start 5233; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5234; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5235; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5236; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5237; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5238; GFX908-NEXT: buffer_wbinvl1 5239; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5240; GFX908-NEXT: v_mov_b32_e32 v7, v5 5241; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5242; GFX908-NEXT: v_mov_b32_e32 v6, v4 5243; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[6:7] 5244; GFX908-NEXT: s_cbranch_execnz .LBB20_4 5245; GFX908-NEXT: ; %bb.5: ; %Flow 5246; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5247; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5248; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 5249; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5250; GFX908-NEXT: s_cbranch_execz .LBB20_2 5251; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private 5252; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5253; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5254; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5255; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5256; GFX908-NEXT: s_waitcnt vmcnt(0) 5257; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5258; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5259; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5260; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5261; GFX908-NEXT: s_waitcnt vmcnt(0) 5262; GFX908-NEXT: s_setpc_b64 s[30:31] 5263; 5264; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5265; GFX8: ; %bb.0: 5266; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5267; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 5268; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 5269; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 5270; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5271; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5272; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5273; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5274; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5275; GFX8-NEXT: s_cbranch_execnz .LBB20_3 5276; GFX8-NEXT: ; %bb.1: ; %Flow3 5277; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5278; GFX8-NEXT: s_cbranch_execnz .LBB20_6 5279; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi 5280; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5281; GFX8-NEXT: s_setpc_b64 s[30:31] 5282; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global 5283; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 5284; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5285; GFX8-NEXT: flat_load_dword v7, v[4:5] 5286; GFX8-NEXT: 
flat_load_dword v6, v[0:1] 5287; GFX8-NEXT: s_mov_b64 s[6:7], 0 5288; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start 5289; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5290; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5291; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5292; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5293; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5294; GFX8-NEXT: buffer_wbinvl1 5295; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5296; GFX8-NEXT: v_mov_b32_e32 v7, v5 5297; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5298; GFX8-NEXT: v_mov_b32_e32 v6, v4 5299; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5300; GFX8-NEXT: s_cbranch_execnz .LBB20_4 5301; GFX8-NEXT: ; %bb.5: ; %Flow 5302; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5303; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5304; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 5305; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5306; GFX8-NEXT: s_cbranch_execz .LBB20_2 5307; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private 5308; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5309; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5310; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 5311; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5312; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 5313; GFX8-NEXT: s_waitcnt vmcnt(0) 5314; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5315; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5316; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 5317; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5318; GFX8-NEXT: s_waitcnt vmcnt(0) 5319; GFX8-NEXT: s_setpc_b64 s[30:31] 5320; 5321; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: 5322; GFX7: ; %bb.0: 5323; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5324; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 5325; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 5326; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 5327; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5328; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5329; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, 
v1 5330; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 5331; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5332; GFX7-NEXT: s_cbranch_execnz .LBB20_3 5333; GFX7-NEXT: ; %bb.1: ; %Flow3 5334; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5335; GFX7-NEXT: s_cbranch_execnz .LBB20_6 5336; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi 5337; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5338; GFX7-NEXT: s_setpc_b64 s[30:31] 5339; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global 5340; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 5341; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5342; GFX7-NEXT: flat_load_dword v7, v[4:5] 5343; GFX7-NEXT: flat_load_dword v6, v[0:1] 5344; GFX7-NEXT: s_mov_b64 s[6:7], 0 5345; GFX7-NEXT: .LBB20_4: ; %atomicrmw.start 5346; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5347; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5348; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5349; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5350; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5351; GFX7-NEXT: buffer_wbinvl1 5352; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5353; GFX7-NEXT: v_mov_b32_e32 v7, v5 5354; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5355; GFX7-NEXT: v_mov_b32_e32 v6, v4 5356; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 5357; GFX7-NEXT: s_cbranch_execnz .LBB20_4 5358; GFX7-NEXT: ; %bb.5: ; %Flow 5359; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 5360; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 5361; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 5362; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5363; GFX7-NEXT: s_cbranch_execz .LBB20_2 5364; GFX7-NEXT: .LBB20_6: ; %atomicrmw.private 5365; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5366; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5367; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 5368; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5369; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 5370; GFX7-NEXT: s_waitcnt vmcnt(0) 5371; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5372; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 
offen 5373; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 5374; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5375; GFX7-NEXT: s_waitcnt vmcnt(0) 5376; GFX7-NEXT: s_setpc_b64 s[30:31] 5377 %gep = getelementptr double, ptr %ptr, i64 255 5378 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst 5379 ret void 5380} 5381 ; Test case - seq_cst, agent-scope atomicrmw fsub of a double through %ptr displaced by -256 doubles (-2048 bytes, which is the 0xfffff800 / -1 address addend seen in the assertions below); the result is left unused and the function returns void. The assertions for this function are autogenerated (see the NOTE at the top of the file), so regenerate them with the update script instead of editing them by hand. 5382define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %val) #0 { 5383; GFX12-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5384; GFX12: ; %bb.0: 5385; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5386; GFX12-NEXT: s_wait_expcnt 0x0 5387; GFX12-NEXT: s_wait_samplecnt 0x0 5388; GFX12-NEXT: s_wait_bvhcnt 0x0 5389; GFX12-NEXT: s_wait_kmcnt 0x0 5390; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 5391; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 5392; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 5393; GFX12-NEXT: s_mov_b32 s0, exec_lo 5394; GFX12-NEXT: s_wait_alu 0xfffe 5395; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5396; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 5397; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 5398; GFX12-NEXT: s_cbranch_execnz .LBB21_3 5399; GFX12-NEXT: ; %bb.1: ; %Flow3 5400; GFX12-NEXT: s_wait_alu 0xfffe 5401; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 5402; GFX12-NEXT: s_cbranch_execnz .LBB21_6 5403; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi 5404; GFX12-NEXT: s_wait_alu 0xfffe 5405; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5406; GFX12-NEXT: s_wait_alu 0xfffe 5407; GFX12-NEXT: s_setpc_b64 s[30:31] 5408; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global 5409; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] 5410; GFX12-NEXT: s_mov_b32 s1, 0 5411; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start 5412; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5413; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5414; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] 5415; GFX12-NEXT: s_wait_storecnt 0x0 5416; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5417; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5418; GFX12-NEXT: global_inv scope:SCOPE_DEV 5419; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5420; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 5421; GFX12-NEXT: s_wait_alu 0xfffe 5422; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 5423; GFX12-NEXT: s_wait_alu 0xfffe 5424; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5425; GFX12-NEXT: s_cbranch_execnz .LBB21_4 5426; GFX12-NEXT: ; %bb.5: ; %Flow 5427; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 5428; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 5429; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 5430; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 5431; GFX12-NEXT: s_cbranch_execz .LBB21_2 5432; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private 5433; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5434; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 5435; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off 5436; GFX12-NEXT: s_wait_loadcnt 0x0 5437; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] 5438; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off 5439; GFX12-NEXT: s_wait_alu 0xfffe 5440; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5441; GFX12-NEXT: s_wait_alu 0xfffe 5442; GFX12-NEXT: s_setpc_b64 s[30:31] 5443; 5444; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5445; GFX940: ; %bb.0: 5446; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5447; GFX940-NEXT: s_movk_i32 s0, 0xf800 5448; GFX940-NEXT: s_mov_b32 s1, -1 5449; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 5450; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 5451; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 5452; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 5453; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 5454; GFX940-NEXT: s_cbranch_execnz .LBB21_3 5455; GFX940-NEXT: ; %bb.1: ; %Flow3 5456; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5457; GFX940-NEXT: s_cbranch_execnz .LBB21_6 5458; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi 5459; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5460; GFX940-NEXT: s_setpc_b64 
s[30:31] 5461; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global 5462; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5463; GFX940-NEXT: s_mov_b64 s[2:3], 0 5464; GFX940-NEXT: .LBB21_4: ; %atomicrmw.start 5465; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 5466; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5467; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5468; GFX940-NEXT: buffer_wbl2 sc1 5469; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 5470; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5471; GFX940-NEXT: buffer_inv sc1 5472; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5473; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 5474; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] 5475; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 5476; GFX940-NEXT: s_cbranch_execnz .LBB21_4 5477; GFX940-NEXT: ; %bb.5: ; %Flow 5478; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 5479; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 5480; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 5481; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5482; GFX940-NEXT: s_cbranch_execz .LBB21_2 5483; GFX940-NEXT: .LBB21_6: ; %atomicrmw.private 5484; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5485; GFX940-NEXT: s_nop 1 5486; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5487; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 5488; GFX940-NEXT: s_waitcnt vmcnt(0) 5489; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5490; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 5491; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5492; GFX940-NEXT: s_waitcnt vmcnt(0) 5493; GFX940-NEXT: s_setpc_b64 s[30:31] 5494; 5495; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5496; GFX11: ; %bb.0: 5497; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5498; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 5499; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 5500; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 5501; GFX11-NEXT: s_mov_b32 s0, exec_lo 5502; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 5503; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 5504; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 5505; GFX11-NEXT: s_cbranch_execnz .LBB21_3 5506; GFX11-NEXT: ; %bb.1: ; %Flow3 5507; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5508; GFX11-NEXT: s_cbranch_execnz .LBB21_6 5509; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi 5510; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5511; GFX11-NEXT: s_setpc_b64 s[30:31] 5512; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global 5513; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] 5514; GFX11-NEXT: s_mov_b32 s1, 0 5515; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start 5516; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5517; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5518; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5519; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5520; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 5521; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5522; GFX11-NEXT: buffer_gl1_inv 5523; GFX11-NEXT: buffer_gl0_inv 5524; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5525; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 5526; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 5527; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5528; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5529; GFX11-NEXT: s_cbranch_execnz .LBB21_4 5530; GFX11-NEXT: ; %bb.5: ; %Flow 5531; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 5532; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 5533; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 5534; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5535; GFX11-NEXT: s_cbranch_execz .LBB21_2 5536; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private 5537; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5538; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 5539; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off 5540; GFX11-NEXT: s_waitcnt vmcnt(0) 5541; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5542; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off 5543; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5544; GFX11-NEXT: s_setpc_b64 s[30:31] 5545; 
5546; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5547; GFX10: ; %bb.0: 5548; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5549; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 5550; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 5551; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 5552; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 5553; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 5554; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 5555; GFX10-NEXT: s_cbranch_execnz .LBB21_3 5556; GFX10-NEXT: ; %bb.1: ; %Flow3 5557; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5558; GFX10-NEXT: s_cbranch_execnz .LBB21_6 5559; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi 5560; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5561; GFX10-NEXT: s_setpc_b64 s[30:31] 5562; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global 5563; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5564; GFX10-NEXT: s_mov_b32 s5, 0 5565; GFX10-NEXT: .LBB21_4: ; %atomicrmw.start 5566; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5567; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5568; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5569; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5570; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5571; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5572; GFX10-NEXT: buffer_gl1_inv 5573; GFX10-NEXT: buffer_gl0_inv 5574; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 5575; GFX10-NEXT: v_mov_b32_e32 v7, v5 5576; GFX10-NEXT: v_mov_b32_e32 v6, v4 5577; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5578; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5579; GFX10-NEXT: s_cbranch_execnz .LBB21_4 5580; GFX10-NEXT: ; %bb.5: ; %Flow 5581; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5582; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 5583; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 5584; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5585; GFX10-NEXT: s_cbranch_execz .LBB21_2 5586; GFX10-NEXT: .LBB21_6: ; %atomicrmw.private 5587; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5588; GFX10-NEXT: 
v_cndmask_b32_e32 v4, -1, v0, vcc_lo 5589; GFX10-NEXT: s_clause 0x1 5590; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5591; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5592; GFX10-NEXT: s_waitcnt vmcnt(0) 5593; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5594; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5595; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5596; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5597; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5598; GFX10-NEXT: s_setpc_b64 s[30:31] 5599; 5600; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5601; GFX90A: ; %bb.0: 5602; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5603; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 5604; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 5605; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 5606; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5607; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5608; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5609; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 5610; GFX90A-NEXT: ; %bb.1: ; %Flow3 5611; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5612; GFX90A-NEXT: s_cbranch_execnz .LBB21_6 5613; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi 5614; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5615; GFX90A-NEXT: s_setpc_b64 s[30:31] 5616; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global 5617; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5618; GFX90A-NEXT: s_mov_b64 s[6:7], 0 5619; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.start 5620; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5621; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5622; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5623; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5624; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5625; GFX90A-NEXT: buffer_wbinvl1 5626; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5627; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5628; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] 
op_sel:[0,1] 5629; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 5630; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 5631; GFX90A-NEXT: ; %bb.5: ; %Flow 5632; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5633; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5634; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 5635; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5636; GFX90A-NEXT: s_cbranch_execz .LBB21_2 5637; GFX90A-NEXT: .LBB21_6: ; %atomicrmw.private 5638; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5639; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5640; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5641; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5642; GFX90A-NEXT: s_waitcnt vmcnt(0) 5643; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5644; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5645; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5646; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5647; GFX90A-NEXT: s_waitcnt vmcnt(0) 5648; GFX90A-NEXT: s_setpc_b64 s[30:31] 5649; 5650; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5651; GFX908: ; %bb.0: 5652; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5653; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 5654; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 5655; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 5656; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5657; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5658; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5659; GFX908-NEXT: s_cbranch_execnz .LBB21_3 5660; GFX908-NEXT: ; %bb.1: ; %Flow3 5661; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5662; GFX908-NEXT: s_cbranch_execnz .LBB21_6 5663; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi 5664; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5665; GFX908-NEXT: s_setpc_b64 s[30:31] 5666; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global 5667; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 5668; GFX908-NEXT: s_mov_b64 s[6:7], 0 5669; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start 
5670; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5671; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5672; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5673; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5674; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5675; GFX908-NEXT: buffer_wbinvl1 5676; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5677; GFX908-NEXT: v_mov_b32_e32 v7, v5 5678; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5679; GFX908-NEXT: v_mov_b32_e32 v6, v4 5680; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 5681; GFX908-NEXT: s_cbranch_execnz .LBB21_4 5682; GFX908-NEXT: ; %bb.5: ; %Flow 5683; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5684; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5685; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 5686; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5687; GFX908-NEXT: s_cbranch_execz .LBB21_2 5688; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private 5689; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5690; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5691; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5692; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 5693; GFX908-NEXT: s_waitcnt vmcnt(0) 5694; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5695; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5696; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5697; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5698; GFX908-NEXT: s_waitcnt vmcnt(0) 5699; GFX908-NEXT: s_setpc_b64 s[30:31] 5700; 5701; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5702; GFX8: ; %bb.0: 5703; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5704; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 5705; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 5706; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 5707; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 5708; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5709; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5710; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5711; GFX8-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] 5712; GFX8-NEXT: s_cbranch_execnz .LBB21_3 5713; GFX8-NEXT: ; %bb.1: ; %Flow3 5714; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5715; GFX8-NEXT: s_cbranch_execnz .LBB21_6 5716; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi 5717; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5718; GFX8-NEXT: s_setpc_b64 s[30:31] 5719; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global 5720; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 5721; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5722; GFX8-NEXT: flat_load_dword v7, v[4:5] 5723; GFX8-NEXT: flat_load_dword v6, v[0:1] 5724; GFX8-NEXT: s_mov_b64 s[6:7], 0 5725; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start 5726; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5727; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5728; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5729; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5730; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5731; GFX8-NEXT: buffer_wbinvl1 5732; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5733; GFX8-NEXT: v_mov_b32_e32 v7, v5 5734; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5735; GFX8-NEXT: v_mov_b32_e32 v6, v4 5736; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5737; GFX8-NEXT: s_cbranch_execnz .LBB21_4 5738; GFX8-NEXT: ; %bb.5: ; %Flow 5739; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5740; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5741; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 5742; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5743; GFX8-NEXT: s_cbranch_execz .LBB21_2 5744; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private 5745; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5746; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5747; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 5748; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5749; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 5750; GFX8-NEXT: s_waitcnt vmcnt(0) 5751; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5752; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5753; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 
5754; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5755; GFX8-NEXT: s_waitcnt vmcnt(0) 5756; GFX8-NEXT: s_setpc_b64 s[30:31] 5757; 5758; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: 5759; GFX7: ; %bb.0: 5760; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5761; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 5762; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 5763; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 5764; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 5765; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5766; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5767; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 5768; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5769; GFX7-NEXT: s_cbranch_execnz .LBB21_3 5770; GFX7-NEXT: ; %bb.1: ; %Flow3 5771; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5772; GFX7-NEXT: s_cbranch_execnz .LBB21_6 5773; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi 5774; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5775; GFX7-NEXT: s_setpc_b64 s[30:31] 5776; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global 5777; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 5778; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 5779; GFX7-NEXT: flat_load_dword v7, v[4:5] 5780; GFX7-NEXT: flat_load_dword v6, v[0:1] 5781; GFX7-NEXT: s_mov_b64 s[6:7], 0 5782; GFX7-NEXT: .LBB21_4: ; %atomicrmw.start 5783; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5784; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5785; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] 5786; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 5787; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5788; GFX7-NEXT: buffer_wbinvl1 5789; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 5790; GFX7-NEXT: v_mov_b32_e32 v7, v5 5791; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5792; GFX7-NEXT: v_mov_b32_e32 v6, v4 5793; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 5794; GFX7-NEXT: s_cbranch_execnz .LBB21_4 5795; GFX7-NEXT: ; %bb.5: ; %Flow 5796; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 5797; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 5798; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 
5799; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5800; GFX7-NEXT: s_cbranch_execz .LBB21_2 5801; GFX7-NEXT: .LBB21_6: ; %atomicrmw.private 5802; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5803; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5804; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 5805; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5806; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 5807; GFX7-NEXT: s_waitcnt vmcnt(0) 5808; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 5809; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5810; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 5811; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5812; GFX7-NEXT: s_waitcnt vmcnt(0) 5813; GFX7-NEXT: s_setpc_b64 s[30:31] 5814 %gep = getelementptr double, ptr %ptr, i64 -256 5815 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst 5816 ret void 5817} 5818 5819; -------------------------------------------------------------------- 5820; half 5821; -------------------------------------------------------------------- 5822 5823define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { 5824; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16: 5825; GFX12: ; %bb.0: 5826; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5827; GFX12-NEXT: s_wait_expcnt 0x0 5828; GFX12-NEXT: s_wait_samplecnt 0x0 5829; GFX12-NEXT: s_wait_bvhcnt 0x0 5830; GFX12-NEXT: s_wait_kmcnt 0x0 5831; GFX12-NEXT: v_mov_b32_e32 v3, v0 5832; GFX12-NEXT: s_mov_b32 s0, 0 5833; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 5834; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 5835; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 5836; GFX12-NEXT: flat_load_b32 v5, v[0:1] 5837; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 5838; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 5839; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5840; GFX12-NEXT: v_not_b32_e32 v4, v4 5841; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start 5842; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5843; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 5844; GFX12-NEXT: v_mov_b32_e32 v6, v5 5845; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5846; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 5847; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 5848; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5849; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 5850; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 5851; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5852; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 5853; GFX12-NEXT: s_wait_storecnt 0x0 5854; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5855; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5856; GFX12-NEXT: global_inv scope:SCOPE_DEV 5857; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 5858; GFX12-NEXT: s_wait_alu 0xfffe 5859; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 5860; GFX12-NEXT: s_wait_alu 0xfffe 5861; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 5862; GFX12-NEXT: s_cbranch_execnz .LBB22_1 5863; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 5864; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5865; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 5866; GFX12-NEXT: s_wait_alu 0xfffe 5867; GFX12-NEXT: s_setpc_b64 s[30:31] 5868; 5869; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16: 5870; GFX940: ; %bb.0: 5871; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5872; GFX940-NEXT: v_mov_b32_e32 v3, v0 5873; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 5874; GFX940-NEXT: flat_load_dword v4, v[0:1] 5875; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 5876; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 5877; GFX940-NEXT: s_mov_b32 s0, 0xffff 5878; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 5879; GFX940-NEXT: v_not_b32_e32 v5, v5 5880; GFX940-NEXT: s_mov_b64 s[0:1], 0 5881; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start 5882; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 5883; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5884; GFX940-NEXT: v_mov_b32_e32 v7, v4 5885; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 5886; 
GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 5887; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 5888; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 5889; GFX940-NEXT: buffer_wbl2 sc1 5890; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 5891; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5892; GFX940-NEXT: buffer_inv sc1 5893; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 5894; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 5895; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 5896; GFX940-NEXT: s_cbranch_execnz .LBB22_1 5897; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 5898; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5899; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 5900; GFX940-NEXT: s_setpc_b64 s[30:31] 5901; 5902; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16: 5903; GFX11: ; %bb.0: 5904; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5905; GFX11-NEXT: v_mov_b32_e32 v3, v0 5906; GFX11-NEXT: s_mov_b32 s0, 0 5907; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 5908; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 5909; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 5910; GFX11-NEXT: flat_load_b32 v5, v[0:1] 5911; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 5912; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 5913; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5914; GFX11-NEXT: v_not_b32_e32 v4, v4 5915; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start 5916; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5917; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5918; GFX11-NEXT: v_mov_b32_e32 v6, v5 5919; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5920; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 5921; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 5922; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5923; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 5924; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 5925; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5926; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 5927; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5928; 
GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 5929; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5930; GFX11-NEXT: buffer_gl1_inv 5931; GFX11-NEXT: buffer_gl0_inv 5932; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 5933; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 5934; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5935; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 5936; GFX11-NEXT: s_cbranch_execnz .LBB22_1 5937; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 5938; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5939; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 5940; GFX11-NEXT: s_setpc_b64 s[30:31] 5941; 5942; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16: 5943; GFX10: ; %bb.0: 5944; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5945; GFX10-NEXT: v_mov_b32_e32 v3, v0 5946; GFX10-NEXT: s_mov_b32 s4, 0 5947; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 5948; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 5949; GFX10-NEXT: flat_load_dword v5, v[0:1] 5950; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 5951; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 5952; GFX10-NEXT: v_not_b32_e32 v4, v4 5953; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start 5954; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5955; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5956; GFX10-NEXT: v_mov_b32_e32 v6, v5 5957; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 5958; GFX10-NEXT: v_sub_f16_e32 v5, v5, v2 5959; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 5960; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 5961; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5962; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 5963; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5964; GFX10-NEXT: buffer_gl1_inv 5965; GFX10-NEXT: buffer_gl0_inv 5966; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 5967; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5968; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5969; GFX10-NEXT: s_cbranch_execnz .LBB22_1 5970; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5971; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 5972; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 5973; GFX10-NEXT: s_setpc_b64 s[30:31] 5974; 5975; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16: 5976; GFX90A: ; %bb.0: 5977; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5978; GFX90A-NEXT: v_mov_b32_e32 v3, v0 5979; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 5980; GFX90A-NEXT: flat_load_dword v4, v[0:1] 5981; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 5982; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 5983; GFX90A-NEXT: s_mov_b32 s4, 0xffff 5984; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 5985; GFX90A-NEXT: v_not_b32_e32 v5, v5 5986; GFX90A-NEXT: s_mov_b64 s[4:5], 0 5987; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start 5988; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5989; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5990; GFX90A-NEXT: v_mov_b32_e32 v7, v4 5991; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 5992; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 5993; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 5994; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 5995; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 5996; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5997; GFX90A-NEXT: buffer_wbinvl1 5998; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 5999; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6000; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 6001; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 6002; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6003; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6004; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6005; GFX90A-NEXT: s_setpc_b64 s[30:31] 6006; 6007; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16: 6008; GFX908: ; %bb.0: 6009; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6010; GFX908-NEXT: v_mov_b32_e32 v3, v0 6011; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 6012; GFX908-NEXT: flat_load_dword v4, v[0:1] 6013; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 6014; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6015; GFX908-NEXT: s_mov_b32 s4, 0xffff 6016; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 6017; GFX908-NEXT: 
v_not_b32_e32 v5, v5 6018; GFX908-NEXT: s_mov_b64 s[4:5], 0 6019; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start 6020; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6021; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6022; GFX908-NEXT: v_mov_b32_e32 v7, v4 6023; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6024; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 6025; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6026; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 6027; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 6028; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6029; GFX908-NEXT: buffer_wbinvl1 6030; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6031; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6032; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 6033; GFX908-NEXT: s_cbranch_execnz .LBB22_1 6034; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6035; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6036; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6037; GFX908-NEXT: s_setpc_b64 s[30:31] 6038; 6039; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16: 6040; GFX8: ; %bb.0: 6041; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6042; GFX8-NEXT: v_mov_b32_e32 v3, v0 6043; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 6044; GFX8-NEXT: flat_load_dword v5, v[0:1] 6045; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 6046; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6047; GFX8-NEXT: s_mov_b32 s4, 0xffff 6048; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6049; GFX8-NEXT: v_not_b32_e32 v4, v4 6050; GFX8-NEXT: s_mov_b64 s[4:5], 0 6051; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start 6052; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6053; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6054; GFX8-NEXT: v_mov_b32_e32 v6, v5 6055; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6056; GFX8-NEXT: v_sub_f16_e32 v5, v5, v2 6057; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 6058; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6059; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 6060; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6061; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6062; GFX8-NEXT: buffer_wbinvl1 
6063; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6064; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6065; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6066; GFX8-NEXT: s_cbranch_execnz .LBB22_1 6067; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6068; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6069; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6070; GFX8-NEXT: s_setpc_b64 s[30:31] 6071; 6072; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16: 6073; GFX7: ; %bb.0: 6074; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6075; GFX7-NEXT: v_mov_b32_e32 v3, v0 6076; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 6077; GFX7-NEXT: flat_load_dword v5, v[0:1] 6078; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 6079; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 6080; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 6081; GFX7-NEXT: s_mov_b64 s[4:5], 0 6082; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 6083; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 6084; GFX7-NEXT: v_not_b32_e32 v4, v4 6085; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start 6086; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6087; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6088; GFX7-NEXT: v_mov_b32_e32 v6, v5 6089; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 6090; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6091; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 6092; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 6093; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6094; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 6095; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 6096; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6097; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6098; GFX7-NEXT: buffer_wbinvl1 6099; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6100; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6101; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6102; GFX7-NEXT: s_cbranch_execnz .LBB22_1 6103; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6104; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6105; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 6106; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 6107; GFX7-NEXT: s_setpc_b64 s[30:31] 6108 %result = atomicrmw fsub ptr %ptr, half %val syncscope("agent") 
; ---- flat_agent_atomic_fsub_ret_f16__offset12b_pos (its define follows on the next line) ----
; Returning form of `atomicrmw fsub` on a half through a flat pointer, at
; syncscope("agent") with seq_cst ordering. The address is %ptr advanced by
; `getelementptr half, ..., i64 1023`, i.e. 1023 * 2 = 0x7fe bytes, which is the
; constant the expected code adds to the pointer before use.
; In the expected output of every RUN line the address is aligned down with
; `and -4`, the containing dword is loaded, and the update is retried in a
; flat_atomic_cmpswap loop; the low two address bits (times 8) give the shift
; used to extract/insert the 16-bit lane. The hawaii run performs the subtract
; in f32 between f16<->f32 conversions; the others use v_sub_f16.
; These assertions are autogenerated by utils/update_llc_test_checks.py:
; regenerate them with that script instead of editing them by hand.
seq_cst 6109 ret half %result 6110} 6111 6112define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 { 6113; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6114; GFX12: ; %bb.0: 6115; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6116; GFX12-NEXT: s_wait_expcnt 0x0 6117; GFX12-NEXT: s_wait_samplecnt 0x0 6118; GFX12-NEXT: s_wait_bvhcnt 0x0 6119; GFX12-NEXT: s_wait_kmcnt 0x0 6120; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 6121; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 6122; GFX12-NEXT: s_mov_b32 s0, 0 6123; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6124; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 6125; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 6126; GFX12-NEXT: flat_load_b32 v5, v[0:1] 6127; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6128; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6129; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6130; GFX12-NEXT: v_not_b32_e32 v4, v4 6131; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start 6132; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6133; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6134; GFX12-NEXT: v_mov_b32_e32 v6, v5 6135; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6136; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6137; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 6138; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6139; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 6140; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6141; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6142; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 6143; GFX12-NEXT: s_wait_storecnt 0x0 6144; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6145; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6146; GFX12-NEXT: global_inv scope:SCOPE_DEV 6147; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6148; GFX12-NEXT: s_wait_alu 0xfffe 6149; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 6150; GFX12-NEXT: s_wait_alu 0xfffe 6151; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6152; GFX12-NEXT: s_cbranch_execnz .LBB23_1 6153; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 6154; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6155; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6156; GFX12-NEXT: s_wait_alu 0xfffe 6157; GFX12-NEXT: s_setpc_b64 s[30:31] 6158; 6159; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6160; GFX940: ; %bb.0: 6161; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6162; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 6163; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] 6164; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 6165; GFX940-NEXT: v_mov_b32_e32 v1, v7 6166; GFX940-NEXT: flat_load_dword v4, v[0:1] 6167; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 6168; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6169; GFX940-NEXT: s_mov_b32 s0, 0xffff 6170; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 6171; GFX940-NEXT: v_not_b32_e32 v5, v5 6172; GFX940-NEXT: s_mov_b64 s[0:1], 0 6173; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start 6174; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6175; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6176; GFX940-NEXT: v_mov_b32_e32 v7, v4 6177; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6178; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 6179; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6180; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 6181; GFX940-NEXT: buffer_wbl2 sc1 6182; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 6183; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6184; GFX940-NEXT: buffer_inv sc1 6185; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6186; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6187; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 6188; GFX940-NEXT: s_cbranch_execnz .LBB23_1 6189; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 6190; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6191; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6192; GFX940-NEXT: s_setpc_b64 s[30:31] 6193; 6194; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6195; GFX11: ; %bb.0: 6196; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6197; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 6198; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 6199; GFX11-NEXT: s_mov_b32 s0, 0 6200; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6201; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 6202; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 6203; GFX11-NEXT: flat_load_b32 v5, v[0:1] 6204; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6205; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6206; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6207; GFX11-NEXT: v_not_b32_e32 v4, v4 6208; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start 6209; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6210; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6211; GFX11-NEXT: v_mov_b32_e32 v6, v5 6212; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6213; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6214; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 6215; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6216; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 6217; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6218; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6219; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 6220; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6221; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 6222; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6223; GFX11-NEXT: buffer_gl1_inv 6224; GFX11-NEXT: buffer_gl0_inv 6225; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6226; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 6227; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6228; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6229; GFX11-NEXT: s_cbranch_execnz .LBB23_1 6230; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6231; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6232; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6233; GFX11-NEXT: s_setpc_b64 s[30:31] 6234; 6235; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6236; GFX10: ; %bb.0: 6237; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6238; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 6239; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 6240; GFX10-NEXT: s_mov_b32 s4, 0 6241; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 6242; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 6243; GFX10-NEXT: flat_load_dword v5, v[0:1] 6244; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6245; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6246; GFX10-NEXT: v_not_b32_e32 v4, v4 6247; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start 6248; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6249; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6250; GFX10-NEXT: v_mov_b32_e32 v6, v5 6251; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6252; GFX10-NEXT: v_sub_f16_e32 v5, v5, v2 6253; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 6254; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 6255; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6256; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6257; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6258; GFX10-NEXT: buffer_gl1_inv 6259; GFX10-NEXT: buffer_gl0_inv 6260; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6261; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6262; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6263; GFX10-NEXT: s_cbranch_execnz .LBB23_1 6264; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6265; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6266; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6267; GFX10-NEXT: s_setpc_b64 s[30:31] 6268; 6269; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6270; GFX90A: ; %bb.0: 6271; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6272; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 6273; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6274; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 6275; GFX90A-NEXT: flat_load_dword v4, v[0:1] 6276; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 6277; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6278; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6279; GFX90A-NEXT: 
v_lshlrev_b32_e64 v5, v3, s4 6280; GFX90A-NEXT: v_not_b32_e32 v5, v5 6281; GFX90A-NEXT: s_mov_b64 s[4:5], 0 6282; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start 6283; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 6284; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6285; GFX90A-NEXT: v_mov_b32_e32 v7, v4 6286; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6287; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 6288; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6289; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 6290; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 6291; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6292; GFX90A-NEXT: buffer_wbinvl1 6293; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6294; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6295; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 6296; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 6297; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6298; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6299; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6300; GFX90A-NEXT: s_setpc_b64 s[30:31] 6301; 6302; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6303; GFX908: ; %bb.0: 6304; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6305; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 6306; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6307; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 6308; GFX908-NEXT: flat_load_dword v4, v[0:1] 6309; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 6310; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6311; GFX908-NEXT: s_mov_b32 s4, 0xffff 6312; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 6313; GFX908-NEXT: v_not_b32_e32 v5, v5 6314; GFX908-NEXT: s_mov_b64 s[4:5], 0 6315; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start 6316; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6317; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6318; GFX908-NEXT: v_mov_b32_e32 v7, v4 6319; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6320; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 6321; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6322; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 
6323; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 6324; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6325; GFX908-NEXT: buffer_wbinvl1 6326; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6327; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6328; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 6329; GFX908-NEXT: s_cbranch_execnz .LBB23_1 6330; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6331; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6332; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6333; GFX908-NEXT: s_setpc_b64 s[30:31] 6334; 6335; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6336; GFX8: ; %bb.0: 6337; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6338; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 6339; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6340; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 6341; GFX8-NEXT: flat_load_dword v5, v[0:1] 6342; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 6343; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6344; GFX8-NEXT: s_mov_b32 s4, 0xffff 6345; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6346; GFX8-NEXT: v_not_b32_e32 v4, v4 6347; GFX8-NEXT: s_mov_b64 s[4:5], 0 6348; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start 6349; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6350; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6351; GFX8-NEXT: v_mov_b32_e32 v6, v5 6352; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6353; GFX8-NEXT: v_sub_f16_e32 v5, v5, v2 6354; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 6355; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6356; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 6357; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6358; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6359; GFX8-NEXT: buffer_wbinvl1 6360; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6361; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6362; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6363; GFX8-NEXT: s_cbranch_execnz .LBB23_1 6364; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6365; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6366; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6367; GFX8-NEXT: s_setpc_b64 s[30:31] 6368; 
; ---- flat_agent_atomic_fsub_ret_f16__offset12b_neg (its define appears later on the next line) ----
; Same returning `atomicrmw fsub` on a half at syncscope("agent"), seq_cst, as
; flat_agent_atomic_fsub_ret_f16__offset12b_pos, but with a negative offset:
; `getelementptr half, ..., i64 -1024`, i.e. -2048 bytes. In the expected code
; this appears as a 64-bit add of 0xfffff800 to the low address dword with -1
; carried into the high dword (or an s[0:1] pair holding the same constant).
; The rest of the expected sequence (align the address with `and -4`, load the
; containing dword, shift/mask the 16-bit lane, retry flat_atomic_cmpswap until
; the returned dword matches) has the same shape as the positive-offset case.
; These assertions are autogenerated by utils/update_llc_test_checks.py:
; regenerate them with that script instead of editing them by hand.
6369; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: 6370; GFX7: ; %bb.0: 6371; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6372; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 6373; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6374; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 6375; GFX7-NEXT: flat_load_dword v5, v[0:1] 6376; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 6377; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 6378; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 6379; GFX7-NEXT: s_mov_b64 s[4:5], 0 6380; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 6381; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 6382; GFX7-NEXT: v_not_b32_e32 v4, v4 6383; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start 6384; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6385; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6386; GFX7-NEXT: v_mov_b32_e32 v6, v5 6387; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 6388; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6389; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 6390; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 6391; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6392; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 6393; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 6394; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6395; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6396; GFX7-NEXT: buffer_wbinvl1 6397; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6398; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6399; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6400; GFX7-NEXT: s_cbranch_execnz .LBB23_1 6401; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6402; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6403; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 6404; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 6405; GFX7-NEXT: s_setpc_b64 s[30:31] 6406 %gep = getelementptr half, ptr %ptr, i64 1023 6407 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst 6408 ret half %result 6409} 6410 6411define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) #0 { 6412; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6413; GFX12: ; %bb.0: 6414; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 6415; GFX12-NEXT: s_wait_expcnt 0x0 6416; GFX12-NEXT: s_wait_samplecnt 0x0 6417; GFX12-NEXT: s_wait_bvhcnt 0x0 6418; GFX12-NEXT: s_wait_kmcnt 0x0 6419; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 6420; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 6421; GFX12-NEXT: s_mov_b32 s0, 0 6422; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6423; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 6424; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 6425; GFX12-NEXT: flat_load_b32 v5, v[0:1] 6426; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6427; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6428; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6429; GFX12-NEXT: v_not_b32_e32 v4, v4 6430; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start 6431; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6432; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6433; GFX12-NEXT: v_mov_b32_e32 v6, v5 6434; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6435; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6436; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 6437; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6438; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 6439; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6440; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6441; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 6442; GFX12-NEXT: s_wait_storecnt 0x0 6443; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6444; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6445; GFX12-NEXT: global_inv scope:SCOPE_DEV 6446; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6447; GFX12-NEXT: s_wait_alu 0xfffe 6448; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 6449; GFX12-NEXT: s_wait_alu 0xfffe 6450; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6451; GFX12-NEXT: s_cbranch_execnz .LBB24_1 6452; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 6453; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6454; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 
6455; GFX12-NEXT: s_wait_alu 0xfffe 6456; GFX12-NEXT: s_setpc_b64 s[30:31] 6457; 6458; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6459; GFX940: ; %bb.0: 6460; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6461; GFX940-NEXT: s_movk_i32 s0, 0xf800 6462; GFX940-NEXT: s_mov_b32 s1, -1 6463; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] 6464; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 6465; GFX940-NEXT: v_mov_b32_e32 v1, v7 6466; GFX940-NEXT: flat_load_dword v4, v[0:1] 6467; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 6468; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6469; GFX940-NEXT: s_mov_b32 s0, 0xffff 6470; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 6471; GFX940-NEXT: v_not_b32_e32 v5, v5 6472; GFX940-NEXT: s_mov_b64 s[0:1], 0 6473; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start 6474; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6475; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6476; GFX940-NEXT: v_mov_b32_e32 v7, v4 6477; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6478; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 6479; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6480; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 6481; GFX940-NEXT: buffer_wbl2 sc1 6482; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 6483; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6484; GFX940-NEXT: buffer_inv sc1 6485; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6486; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6487; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 6488; GFX940-NEXT: s_cbranch_execnz .LBB24_1 6489; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 6490; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6491; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6492; GFX940-NEXT: s_setpc_b64 s[30:31] 6493; 6494; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6495; GFX11: ; %bb.0: 6496; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6497; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 6498; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 6499; GFX11-NEXT: s_mov_b32 s0, 0 6500; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6501; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 6502; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 6503; GFX11-NEXT: flat_load_b32 v5, v[0:1] 6504; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6505; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6506; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6507; GFX11-NEXT: v_not_b32_e32 v4, v4 6508; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start 6509; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6510; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6511; GFX11-NEXT: v_mov_b32_e32 v6, v5 6512; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6513; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6514; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 6515; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6516; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 6517; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6518; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6519; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 6520; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6521; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 6522; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6523; GFX11-NEXT: buffer_gl1_inv 6524; GFX11-NEXT: buffer_gl0_inv 6525; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6526; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 6527; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6528; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6529; GFX11-NEXT: s_cbranch_execnz .LBB24_1 6530; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6531; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6532; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6533; GFX11-NEXT: s_setpc_b64 s[30:31] 6534; 6535; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6536; GFX10: ; %bb.0: 6537; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6538; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 6539; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 6540; GFX10-NEXT: s_mov_b32 s4, 0 
6541; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 6542; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 6543; GFX10-NEXT: flat_load_dword v5, v[0:1] 6544; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6545; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6546; GFX10-NEXT: v_not_b32_e32 v4, v4 6547; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start 6548; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6549; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6550; GFX10-NEXT: v_mov_b32_e32 v6, v5 6551; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6552; GFX10-NEXT: v_sub_f16_e32 v5, v5, v2 6553; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 6554; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 6555; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6556; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6557; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6558; GFX10-NEXT: buffer_gl1_inv 6559; GFX10-NEXT: buffer_gl0_inv 6560; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6561; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6562; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6563; GFX10-NEXT: s_cbranch_execnz .LBB24_1 6564; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6565; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6566; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6567; GFX10-NEXT: s_setpc_b64 s[30:31] 6568; 6569; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6570; GFX90A: ; %bb.0: 6571; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6572; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 6573; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 6574; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 6575; GFX90A-NEXT: flat_load_dword v4, v[0:1] 6576; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 6577; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6578; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6579; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 6580; GFX90A-NEXT: v_not_b32_e32 v5, v5 6581; GFX90A-NEXT: s_mov_b64 s[4:5], 0 6582; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start 6583; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 6584; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6585; GFX90A-NEXT: v_mov_b32_e32 v7, v4 6586; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6587; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 6588; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6589; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 6590; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 6591; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6592; GFX90A-NEXT: buffer_wbinvl1 6593; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6594; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6595; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 6596; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 6597; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6598; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6599; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6600; GFX90A-NEXT: s_setpc_b64 s[30:31] 6601; 6602; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6603; GFX908: ; %bb.0: 6604; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6605; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 6606; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 6607; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 6608; GFX908-NEXT: flat_load_dword v4, v[0:1] 6609; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 6610; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6611; GFX908-NEXT: s_mov_b32 s4, 0xffff 6612; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 6613; GFX908-NEXT: v_not_b32_e32 v5, v5 6614; GFX908-NEXT: s_mov_b64 s[4:5], 0 6615; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start 6616; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6617; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6618; GFX908-NEXT: v_mov_b32_e32 v7, v4 6619; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 6620; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 6621; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6622; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 6623; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 6624; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6625; GFX908-NEXT: buffer_wbinvl1 6626; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 
6627; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6628; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 6629; GFX908-NEXT: s_cbranch_execnz .LBB24_1 6630; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6631; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6632; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 6633; GFX908-NEXT: s_setpc_b64 s[30:31] 6634; 6635; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6636; GFX8: ; %bb.0: 6637; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6638; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 6639; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 6640; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 6641; GFX8-NEXT: flat_load_dword v5, v[0:1] 6642; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 6643; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6644; GFX8-NEXT: s_mov_b32 s4, 0xffff 6645; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6646; GFX8-NEXT: v_not_b32_e32 v4, v4 6647; GFX8-NEXT: s_mov_b64 s[4:5], 0 6648; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start 6649; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6650; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6651; GFX8-NEXT: v_mov_b32_e32 v6, v5 6652; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6653; GFX8-NEXT: v_sub_f16_e32 v5, v5, v2 6654; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 6655; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6656; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 6657; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6658; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6659; GFX8-NEXT: buffer_wbinvl1 6660; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6661; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6662; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6663; GFX8-NEXT: s_cbranch_execnz .LBB24_1 6664; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6665; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6666; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6667; GFX8-NEXT: s_setpc_b64 s[30:31] 6668; 6669; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: 6670; GFX7: ; %bb.0: 6671; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6672; GFX7-NEXT: v_add_i32_e32 v3, vcc, 
0xfffff800, v0 6673; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 6674; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 6675; GFX7-NEXT: flat_load_dword v5, v[0:1] 6676; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 6677; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 6678; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 6679; GFX7-NEXT: s_mov_b64 s[4:5], 0 6680; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 6681; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 6682; GFX7-NEXT: v_not_b32_e32 v4, v4 6683; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start 6684; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6685; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6686; GFX7-NEXT: v_mov_b32_e32 v6, v5 6687; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 6688; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6689; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 6690; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 6691; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6692; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 6693; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 6694; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6695; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6696; GFX7-NEXT: buffer_wbinvl1 6697; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6698; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6699; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6700; GFX7-NEXT: s_cbranch_execnz .LBB24_1 6701; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6702; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6703; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 6704; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 6705; GFX7-NEXT: s_setpc_b64 s[30:31] 6706 %gep = getelementptr half, ptr %ptr, i64 -1024 6707 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst 6708 ret half %result 6709 } 6710
; flat_agent_atomic_fsub_noret_f16: seq_cst, agent-scope atomicrmw fsub of a half
; directly at %ptr (no GEP); the value read back (%unused) is discarded and the
; function returns void. In the autogenerated expectations that follow, every run
; line masks the address with -4, loads the containing dword and retries a 32-bit
; flat_atomic_cmpswap in the %atomicrmw.start loop; the hawaii output additionally
; converts through f32 (v_cvt_f32_f16 / v_sub_f32 / v_cvt_f16_f32) instead of
; using v_sub_f16. Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
6711define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { 6712; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16: 6713; GFX12: ; %bb.0: 6714; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6715; GFX12-NEXT: s_wait_expcnt 0x0 6716; GFX12-NEXT: s_wait_samplecnt 0x0 6717; GFX12-NEXT: s_wait_bvhcnt 0x0 6718; GFX12-NEXT: s_wait_kmcnt 0x0 6719; GFX12-NEXT: 
v_mov_b32_e32 v3, v0 6720; GFX12-NEXT: s_mov_b32 s0, 0 6721; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6722; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 6723; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 6724; GFX12-NEXT: flat_load_b32 v4, v[0:1] 6725; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 6726; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 6727; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6728; GFX12-NEXT: v_not_b32_e32 v6, v3 6729; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start 6730; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6731; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6732; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 6733; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6734; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 6735; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 6736; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6737; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 6738; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 6739; GFX12-NEXT: s_wait_storecnt 0x0 6740; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6741; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6742; GFX12-NEXT: global_inv scope:SCOPE_DEV 6743; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 6744; GFX12-NEXT: v_mov_b32_e32 v4, v3 6745; GFX12-NEXT: s_wait_alu 0xfffe 6746; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 6747; GFX12-NEXT: s_wait_alu 0xfffe 6748; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6749; GFX12-NEXT: s_cbranch_execnz .LBB25_1 6750; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 6751; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6752; GFX12-NEXT: s_wait_alu 0xfffe 6753; GFX12-NEXT: s_setpc_b64 s[30:31] 6754; 6755; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: 6756; GFX940: ; %bb.0: 6757; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6758; GFX940-NEXT: v_mov_b32_e32 v3, v0 6759; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 6760; GFX940-NEXT: flat_load_dword v5, v[0:1] 6761; GFX940-NEXT: 
v_and_b32_e32 v3, 3, v3 6762; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6763; GFX940-NEXT: s_mov_b32 s0, 0xffff 6764; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 6765; GFX940-NEXT: v_not_b32_e32 v6, v4 6766; GFX940-NEXT: s_mov_b64 s[0:1], 0 6767; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start 6768; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6769; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6770; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 6771; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 6772; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6773; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 6774; GFX940-NEXT: buffer_wbl2 sc1 6775; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 6776; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6777; GFX940-NEXT: buffer_inv sc1 6778; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 6779; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6780; GFX940-NEXT: v_mov_b32_e32 v5, v4 6781; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 6782; GFX940-NEXT: s_cbranch_execnz .LBB25_1 6783; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 6784; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6785; GFX940-NEXT: s_setpc_b64 s[30:31] 6786; 6787; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16: 6788; GFX11: ; %bb.0: 6789; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6790; GFX11-NEXT: v_mov_b32_e32 v3, v0 6791; GFX11-NEXT: s_mov_b32 s0, 0 6792; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6793; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 6794; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 6795; GFX11-NEXT: flat_load_b32 v4, v[0:1] 6796; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 6797; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 6798; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6799; GFX11-NEXT: v_not_b32_e32 v6, v3 6800; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start 6801; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6802; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6803; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 6804; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) 6805; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 6806; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 6807; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6808; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 6809; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 6810; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6811; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 6812; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6813; GFX11-NEXT: buffer_gl1_inv 6814; GFX11-NEXT: buffer_gl0_inv 6815; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 6816; GFX11-NEXT: v_mov_b32_e32 v4, v3 6817; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 6818; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6819; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6820; GFX11-NEXT: s_cbranch_execnz .LBB25_1 6821; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6822; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6823; GFX11-NEXT: s_setpc_b64 s[30:31] 6824; 6825; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16: 6826; GFX10: ; %bb.0: 6827; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6828; GFX10-NEXT: v_mov_b32_e32 v3, v0 6829; GFX10-NEXT: s_mov_b32 s4, 0 6830; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 6831; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 6832; GFX10-NEXT: flat_load_dword v4, v[0:1] 6833; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 6834; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 6835; GFX10-NEXT: v_not_b32_e32 v6, v3 6836; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start 6837; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6838; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6839; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 6840; GFX10-NEXT: v_sub_f16_e32 v3, v3, v2 6841; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 6842; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 6843; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6844; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6845; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6846; GFX10-NEXT: buffer_gl1_inv 6847; 
GFX10-NEXT: buffer_gl0_inv 6848; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 6849; GFX10-NEXT: v_mov_b32_e32 v4, v3 6850; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6851; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6852; GFX10-NEXT: s_cbranch_execnz .LBB25_1 6853; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6854; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6855; GFX10-NEXT: s_setpc_b64 s[30:31] 6856; 6857; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16: 6858; GFX90A: ; %bb.0: 6859; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6860; GFX90A-NEXT: v_mov_b32_e32 v3, v0 6861; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 6862; GFX90A-NEXT: flat_load_dword v5, v[0:1] 6863; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 6864; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6865; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6866; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6867; GFX90A-NEXT: v_not_b32_e32 v6, v4 6868; GFX90A-NEXT: s_mov_b64 s[4:5], 0 6869; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start 6870; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 6871; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6872; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 6873; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 6874; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6875; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 6876; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 6877; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6878; GFX90A-NEXT: buffer_wbinvl1 6879; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 6880; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6881; GFX90A-NEXT: v_mov_b32_e32 v5, v4 6882; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 6883; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 6884; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6885; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6886; GFX90A-NEXT: s_setpc_b64 s[30:31] 6887; 6888; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16: 6889; GFX908: ; %bb.0: 6890; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6891; GFX908-NEXT: v_mov_b32_e32 v3, v0 6892; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 6893; 
GFX908-NEXT: flat_load_dword v4, v[0:1] 6894; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 6895; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 6896; GFX908-NEXT: s_mov_b32 s4, 0xffff 6897; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 6898; GFX908-NEXT: v_not_b32_e32 v6, v3 6899; GFX908-NEXT: s_mov_b64 s[4:5], 0 6900; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start 6901; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6902; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6903; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 6904; GFX908-NEXT: v_sub_f16_e32 v3, v3, v2 6905; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 6906; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 6907; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6908; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6909; GFX908-NEXT: buffer_wbinvl1 6910; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6911; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6912; GFX908-NEXT: v_mov_b32_e32 v4, v3 6913; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 6914; GFX908-NEXT: s_cbranch_execnz .LBB25_1 6915; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6916; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6917; GFX908-NEXT: s_setpc_b64 s[30:31] 6918; 6919; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16: 6920; GFX8: ; %bb.0: 6921; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6922; GFX8-NEXT: v_mov_b32_e32 v3, v0 6923; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 6924; GFX8-NEXT: flat_load_dword v4, v[0:1] 6925; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 6926; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 6927; GFX8-NEXT: s_mov_b32 s4, 0xffff 6928; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 6929; GFX8-NEXT: v_not_b32_e32 v6, v3 6930; GFX8-NEXT: s_mov_b64 s[4:5], 0 6931; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start 6932; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6933; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6934; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 6935; GFX8-NEXT: v_sub_f16_e32 v3, v3, v2 6936; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 6937; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 6938; GFX8-NEXT: 
v_or_b32_e32 v3, v7, v3 6939; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6940; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6941; GFX8-NEXT: buffer_wbinvl1 6942; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6943; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6944; GFX8-NEXT: v_mov_b32_e32 v4, v3 6945; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6946; GFX8-NEXT: s_cbranch_execnz .LBB25_1 6947; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6948; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6949; GFX8-NEXT: s_setpc_b64 s[30:31] 6950; 6951; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16: 6952; GFX7: ; %bb.0: 6953; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6954; GFX7-NEXT: v_mov_b32_e32 v3, v0 6955; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 6956; GFX7-NEXT: flat_load_dword v4, v[0:1] 6957; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 6958; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 6959; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 6960; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 6961; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6962; GFX7-NEXT: v_not_b32_e32 v6, v3 6963; GFX7-NEXT: s_mov_b64 s[4:5], 0 6964; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start 6965; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6966; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6967; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 6968; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 6969; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 6970; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 6971; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 6972; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 6973; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 6974; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 6975; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6976; GFX7-NEXT: buffer_wbinvl1 6977; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 6978; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6979; GFX7-NEXT: v_mov_b32_e32 v4, v3 6980; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6981; GFX7-NEXT: s_cbranch_execnz .LBB25_1 6982; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6983; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6984; GFX7-NEXT: s_setpc_b64 s[30:31] 6985 
%unused = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst 6986 ret void 6987} 6988
; flat_agent_atomic_fsub_noret_f16__offset12b_pos: seq_cst, agent-scope atomicrmw
; fsub of a half reached through a GEP of +1023 half elements; the result
; (%unused) is discarded and the function returns void. The autogenerated
; expectations that follow add the byte offset 0x7fe to the 64-bit pointer
; (add + carry pair, or v_lshl_add_u64 on gfx940), mask the low address word
; with -4, and retry a 32-bit flat_atomic_cmpswap on the containing dword in the
; %atomicrmw.start loop. Do not hand-edit the assertion lines; regenerate them
; with utils/update_llc_test_checks.py.
6989define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 { 6990; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 6991; GFX12: ; %bb.0: 6992; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6993; GFX12-NEXT: s_wait_expcnt 0x0 6994; GFX12-NEXT: s_wait_samplecnt 0x0 6995; GFX12-NEXT: s_wait_bvhcnt 0x0 6996; GFX12-NEXT: s_wait_kmcnt 0x0 6997; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 6998; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 6999; GFX12-NEXT: s_mov_b32 s0, 0 7000; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 7001; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 7002; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 7003; GFX12-NEXT: flat_load_b32 v4, v[0:1] 7004; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7005; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7006; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7007; GFX12-NEXT: v_not_b32_e32 v6, v3 7008; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start 7009; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7010; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7011; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7012; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7013; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 7014; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 7015; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7016; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7017; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 7018; GFX12-NEXT: s_wait_storecnt 0x0 7019; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7020; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7021; GFX12-NEXT: global_inv scope:SCOPE_DEV 7022; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7023; GFX12-NEXT: v_mov_b32_e32 v4, v3 7024; GFX12-NEXT: s_wait_alu 0xfffe 7025; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7026; GFX12-NEXT: s_wait_alu 
0xfffe 7027; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7028; GFX12-NEXT: s_cbranch_execnz .LBB26_1 7029; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7030; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7031; GFX12-NEXT: s_wait_alu 0xfffe 7032; GFX12-NEXT: s_setpc_b64 s[30:31] 7033; 7034; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7035; GFX940: ; %bb.0: 7036; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7037; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 7038; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 7039; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 7040; GFX940-NEXT: v_mov_b32_e32 v1, v5 7041; GFX940-NEXT: flat_load_dword v5, v[0:1] 7042; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 7043; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 7044; GFX940-NEXT: s_mov_b32 s0, 0xffff 7045; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 7046; GFX940-NEXT: v_not_b32_e32 v6, v4 7047; GFX940-NEXT: s_mov_b64 s[0:1], 0 7048; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start 7049; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7050; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7051; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 7052; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 7053; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 7054; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 7055; GFX940-NEXT: buffer_wbl2 sc1 7056; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 7057; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7058; GFX940-NEXT: buffer_inv sc1 7059; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 7060; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7061; GFX940-NEXT: v_mov_b32_e32 v5, v4 7062; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7063; GFX940-NEXT: s_cbranch_execnz .LBB26_1 7064; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7065; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7066; GFX940-NEXT: s_setpc_b64 s[30:31] 7067; 7068; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7069; GFX11: ; %bb.0: 7070; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7071; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 
0x7fe, v0 7072; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7073; GFX11-NEXT: s_mov_b32 s0, 0 7074; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 7075; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 7076; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 7077; GFX11-NEXT: flat_load_b32 v4, v[0:1] 7078; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7079; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7080; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7081; GFX11-NEXT: v_not_b32_e32 v6, v3 7082; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start 7083; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7084; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7085; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7086; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7087; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 7088; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 7089; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7090; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7091; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 7092; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7093; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 7094; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7095; GFX11-NEXT: buffer_gl1_inv 7096; GFX11-NEXT: buffer_gl0_inv 7097; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7098; GFX11-NEXT: v_mov_b32_e32 v4, v3 7099; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7100; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7101; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7102; GFX11-NEXT: s_cbranch_execnz .LBB26_1 7103; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7104; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7105; GFX11-NEXT: s_setpc_b64 s[30:31] 7106; 7107; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7108; GFX10: ; %bb.0: 7109; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7110; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 7111; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7112; GFX10-NEXT: 
s_mov_b32 s4, 0 7113; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 7114; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 7115; GFX10-NEXT: flat_load_dword v4, v[0:1] 7116; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7117; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7118; GFX10-NEXT: v_not_b32_e32 v6, v3 7119; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start 7120; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7121; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7122; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7123; GFX10-NEXT: v_sub_f16_e32 v3, v3, v2 7124; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 7125; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 7126; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7127; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7128; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7129; GFX10-NEXT: buffer_gl1_inv 7130; GFX10-NEXT: buffer_gl0_inv 7131; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7132; GFX10-NEXT: v_mov_b32_e32 v4, v3 7133; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7134; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7135; GFX10-NEXT: s_cbranch_execnz .LBB26_1 7136; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7137; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7138; GFX10-NEXT: s_setpc_b64 s[30:31] 7139; 7140; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7141; GFX90A: ; %bb.0: 7142; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7143; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 7144; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7145; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 7146; GFX90A-NEXT: flat_load_dword v5, v[0:1] 7147; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 7148; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 7149; GFX90A-NEXT: s_mov_b32 s4, 0xffff 7150; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 7151; GFX90A-NEXT: v_not_b32_e32 v6, v4 7152; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7153; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start 7154; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7155; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 7156; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 7157; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 7158; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 7159; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 7160; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 7161; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7162; GFX90A-NEXT: buffer_wbinvl1 7163; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 7164; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7165; GFX90A-NEXT: v_mov_b32_e32 v5, v4 7166; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7167; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 7168; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7169; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7170; GFX90A-NEXT: s_setpc_b64 s[30:31] 7171; 7172; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7173; GFX908: ; %bb.0: 7174; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7175; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 7176; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7177; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 7178; GFX908-NEXT: flat_load_dword v4, v[0:1] 7179; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 7180; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7181; GFX908-NEXT: s_mov_b32 s4, 0xffff 7182; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 7183; GFX908-NEXT: v_not_b32_e32 v6, v3 7184; GFX908-NEXT: s_mov_b64 s[4:5], 0 7185; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start 7186; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7187; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7188; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7189; GFX908-NEXT: v_sub_f16_e32 v3, v3, v2 7190; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7191; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 7192; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7193; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7194; GFX908-NEXT: buffer_wbinvl1 7195; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7196; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7197; GFX908-NEXT: v_mov_b32_e32 v4, v3 7198; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] 7199; GFX908-NEXT: s_cbranch_execnz .LBB26_1 7200; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7201; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7202; GFX908-NEXT: s_setpc_b64 s[30:31] 7203; 7204; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7205; GFX8: ; %bb.0: 7206; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7207; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 7208; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7209; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 7210; GFX8-NEXT: flat_load_dword v4, v[0:1] 7211; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 7212; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7213; GFX8-NEXT: s_mov_b32 s4, 0xffff 7214; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 7215; GFX8-NEXT: v_not_b32_e32 v6, v3 7216; GFX8-NEXT: s_mov_b64 s[4:5], 0 7217; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start 7218; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7219; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7220; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7221; GFX8-NEXT: v_sub_f16_e32 v3, v3, v2 7222; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 7223; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7224; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 7225; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7226; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7227; GFX8-NEXT: buffer_wbinvl1 7228; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7229; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7230; GFX8-NEXT: v_mov_b32_e32 v4, v3 7231; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7232; GFX8-NEXT: s_cbranch_execnz .LBB26_1 7233; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7234; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7235; GFX8-NEXT: s_setpc_b64 s[30:31] 7236; 7237; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: 7238; GFX7: ; %bb.0: 7239; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7240; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 7241; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7242; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 7243; GFX7-NEXT: flat_load_dword v3, v[0:1] 7244; GFX7-NEXT: v_cvt_f16_f32_e32 
v2, v2 7245; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 7246; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7247; GFX7-NEXT: s_mov_b64 s[4:5], 0 7248; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 7249; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 7250; GFX7-NEXT: v_not_b32_e32 v6, v2 7251; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start 7252; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7253; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7254; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7255; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 7256; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 7257; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 7258; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7259; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7260; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 7261; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7262; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7263; GFX7-NEXT: buffer_wbinvl1 7264; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7265; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7266; GFX7-NEXT: v_mov_b32_e32 v3, v2 7267; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7268; GFX7-NEXT: s_cbranch_execnz .LBB26_1 7269; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7270; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7271; GFX7-NEXT: s_setpc_b64 s[30:31] 7272 %gep = getelementptr half, ptr %ptr, i64 1023 7273 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst 7274 ret void 7275} 7276 7277define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val) #0 { 7278; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7279; GFX12: ; %bb.0: 7280; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7281; GFX12-NEXT: s_wait_expcnt 0x0 7282; GFX12-NEXT: s_wait_samplecnt 0x0 7283; GFX12-NEXT: s_wait_bvhcnt 0x0 7284; GFX12-NEXT: s_wait_kmcnt 0x0 7285; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 7286; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7287; GFX12-NEXT: s_mov_b32 s0, 0 7288; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 7289; GFX12-NEXT: v_and_b32_e32 v0, 
-4, v3 7290; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 7291; GFX12-NEXT: flat_load_b32 v4, v[0:1] 7292; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7293; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7294; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7295; GFX12-NEXT: v_not_b32_e32 v6, v3 7296; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start 7297; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7298; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7299; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7300; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7301; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 7302; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 7303; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7304; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7305; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 7306; GFX12-NEXT: s_wait_storecnt 0x0 7307; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7308; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7309; GFX12-NEXT: global_inv scope:SCOPE_DEV 7310; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7311; GFX12-NEXT: v_mov_b32_e32 v4, v3 7312; GFX12-NEXT: s_wait_alu 0xfffe 7313; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7314; GFX12-NEXT: s_wait_alu 0xfffe 7315; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7316; GFX12-NEXT: s_cbranch_execnz .LBB27_1 7317; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7318; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7319; GFX12-NEXT: s_wait_alu 0xfffe 7320; GFX12-NEXT: s_setpc_b64 s[30:31] 7321; 7322; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7323; GFX940: ; %bb.0: 7324; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7325; GFX940-NEXT: s_movk_i32 s0, 0xf800 7326; GFX940-NEXT: s_mov_b32 s1, -1 7327; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 7328; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 7329; GFX940-NEXT: v_mov_b32_e32 v1, v5 7330; GFX940-NEXT: flat_load_dword v5, v[0:1] 7331; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 7332; 
GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 7333; GFX940-NEXT: s_mov_b32 s0, 0xffff 7334; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 7335; GFX940-NEXT: v_not_b32_e32 v6, v4 7336; GFX940-NEXT: s_mov_b64 s[0:1], 0 7337; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start 7338; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7339; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7340; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 7341; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 7342; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 7343; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 7344; GFX940-NEXT: buffer_wbl2 sc1 7345; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 7346; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7347; GFX940-NEXT: buffer_inv sc1 7348; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 7349; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7350; GFX940-NEXT: v_mov_b32_e32 v5, v4 7351; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7352; GFX940-NEXT: s_cbranch_execnz .LBB27_1 7353; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7354; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7355; GFX940-NEXT: s_setpc_b64 s[30:31] 7356; 7357; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7358; GFX11: ; %bb.0: 7359; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7360; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 7361; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7362; GFX11-NEXT: s_mov_b32 s0, 0 7363; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 7364; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 7365; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 7366; GFX11-NEXT: flat_load_b32 v4, v[0:1] 7367; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7368; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7369; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7370; GFX11-NEXT: v_not_b32_e32 v6, v3 7371; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start 7372; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7373; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7374; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, 
v4 7375; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7376; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 7377; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 7378; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7379; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7380; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 7381; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7382; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 7383; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7384; GFX11-NEXT: buffer_gl1_inv 7385; GFX11-NEXT: buffer_gl0_inv 7386; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7387; GFX11-NEXT: v_mov_b32_e32 v4, v3 7388; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7389; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7390; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7391; GFX11-NEXT: s_cbranch_execnz .LBB27_1 7392; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7393; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7394; GFX11-NEXT: s_setpc_b64 s[30:31] 7395; 7396; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7397; GFX10: ; %bb.0: 7398; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7399; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 7400; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7401; GFX10-NEXT: s_mov_b32 s4, 0 7402; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 7403; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 7404; GFX10-NEXT: flat_load_dword v4, v[0:1] 7405; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7406; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7407; GFX10-NEXT: v_not_b32_e32 v6, v3 7408; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start 7409; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7410; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7411; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7412; GFX10-NEXT: v_sub_f16_e32 v3, v3, v2 7413; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 7414; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 7415; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 7416; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7417; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7418; GFX10-NEXT: buffer_gl1_inv 7419; GFX10-NEXT: buffer_gl0_inv 7420; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7421; GFX10-NEXT: v_mov_b32_e32 v4, v3 7422; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7423; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7424; GFX10-NEXT: s_cbranch_execnz .LBB27_1 7425; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7426; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7427; GFX10-NEXT: s_setpc_b64 s[30:31] 7428; 7429; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7430; GFX90A: ; %bb.0: 7431; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7432; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 7433; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 7434; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 7435; GFX90A-NEXT: flat_load_dword v5, v[0:1] 7436; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 7437; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 7438; GFX90A-NEXT: s_mov_b32 s4, 0xffff 7439; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 7440; GFX90A-NEXT: v_not_b32_e32 v6, v4 7441; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7442; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start 7443; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7444; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7445; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 7446; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 7447; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 7448; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 7449; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 7450; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7451; GFX90A-NEXT: buffer_wbinvl1 7452; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 7453; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7454; GFX90A-NEXT: v_mov_b32_e32 v5, v4 7455; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7456; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 7457; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7458; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7459; 
GFX90A-NEXT: s_setpc_b64 s[30:31] 7460; 7461; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7462; GFX908: ; %bb.0: 7463; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7464; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 7465; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 7466; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 7467; GFX908-NEXT: flat_load_dword v4, v[0:1] 7468; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 7469; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7470; GFX908-NEXT: s_mov_b32 s4, 0xffff 7471; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 7472; GFX908-NEXT: v_not_b32_e32 v6, v3 7473; GFX908-NEXT: s_mov_b64 s[4:5], 0 7474; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start 7475; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7476; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7477; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7478; GFX908-NEXT: v_sub_f16_e32 v3, v3, v2 7479; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7480; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 7481; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7482; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7483; GFX908-NEXT: buffer_wbinvl1 7484; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7485; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7486; GFX908-NEXT: v_mov_b32_e32 v4, v3 7487; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7488; GFX908-NEXT: s_cbranch_execnz .LBB27_1 7489; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7490; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7491; GFX908-NEXT: s_setpc_b64 s[30:31] 7492; 7493; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7494; GFX8: ; %bb.0: 7495; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7496; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 7497; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 7498; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 7499; GFX8-NEXT: flat_load_dword v4, v[0:1] 7500; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 7501; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7502; GFX8-NEXT: s_mov_b32 s4, 0xffff 7503; GFX8-NEXT: 
v_lshlrev_b32_e64 v3, v5, s4 7504; GFX8-NEXT: v_not_b32_e32 v6, v3 7505; GFX8-NEXT: s_mov_b64 s[4:5], 0 7506; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start 7507; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7508; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7509; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7510; GFX8-NEXT: v_sub_f16_e32 v3, v3, v2 7511; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 7512; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7513; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 7514; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7515; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7516; GFX8-NEXT: buffer_wbinvl1 7517; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7518; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7519; GFX8-NEXT: v_mov_b32_e32 v4, v3 7520; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7521; GFX8-NEXT: s_cbranch_execnz .LBB27_1 7522; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7523; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7524; GFX8-NEXT: s_setpc_b64 s[30:31] 7525; 7526; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: 7527; GFX7: ; %bb.0: 7528; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7529; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 7530; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 7531; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 7532; GFX7-NEXT: flat_load_dword v3, v[0:1] 7533; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7534; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 7535; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7536; GFX7-NEXT: s_mov_b64 s[4:5], 0 7537; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 7538; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 7539; GFX7-NEXT: v_not_b32_e32 v6, v2 7540; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start 7541; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7542; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7543; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7544; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 7545; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 7546; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 7547; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7548; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 
; -----------------------------------------------------------------------------
; flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4
;
; Review note - the line structure of this file appears to have been flattened
; (many original lines, each carrying a fused line number, share one physical
; line). The `define` for this test therefore starts partway through the
; physical line that follows this comment, after the tail of the preceding
; test. The existing text is deliberately left untouched.
;
; IR under test - an `atomicrmw fsub` of a half located 1023 elements past
; %ptr, with syncscope("agent"), seq_cst ordering and `align 4`. The old value
; is returned to the caller.
;
; What the autogenerated checks pin down
;  * every target expects a flat_atomic_cmpswap retry loop on a 32-bit word
;    (label .LBB28_1, block %atomicrmw.start);
;  * the new f16 value is merged into that word with a 0xffff0000 mask; no
;    address masking or lane shift is expected for this aligned case;
;  * the element offset shows up either as the immediate `offset:2046` or, in
;    the gfx1010, tonga and hawaii checks, as an explicit add of 0x7fe;
;  * the hawaii checks convert through f32 (v_cvt_f32_f16, v_sub_f32,
;    v_cvt_f16_f32) instead of using v_sub_f16.
;
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
; -----------------------------------------------------------------------------
7549; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 7550; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7551; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7552; GFX7-NEXT: buffer_wbinvl1 7553; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7554; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7555; GFX7-NEXT: v_mov_b32_e32 v3, v2 7556; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7557; GFX7-NEXT: s_cbranch_execnz .LBB27_1 7558; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7559; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7560; GFX7-NEXT: s_setpc_b64 s[30:31] 7561 %gep = getelementptr half, ptr %ptr, i64 -1024 7562 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst 7563 ret void 7564} 7565 7566define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, half %val) #0 { 7567; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7568; GFX12: ; %bb.0: 7569; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7570; GFX12-NEXT: s_wait_expcnt 0x0 7571; GFX12-NEXT: s_wait_samplecnt 0x0 7572; GFX12-NEXT: s_wait_bvhcnt 0x0 7573; GFX12-NEXT: s_wait_kmcnt 0x0 7574; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 7575; GFX12-NEXT: s_mov_b32 s0, 0 7576; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start 7577; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7578; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7579; GFX12-NEXT: v_mov_b32_e32 v4, v3 7580; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7581; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2 7582; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 7583; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7584; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7585; GFX12-NEXT: s_wait_storecnt 0x0 7586; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7587; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7588; GFX12-NEXT: global_inv scope:SCOPE_DEV 7589; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7590; GFX12-NEXT: s_wait_alu 0xfffe 7591; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7592; GFX12-NEXT: 
s_wait_alu 0xfffe 7593; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7594; GFX12-NEXT: s_cbranch_execnz .LBB28_1 7595; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7596; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7597; GFX12-NEXT: v_mov_b32_e32 v0, v3 7598; GFX12-NEXT: s_wait_alu 0xfffe 7599; GFX12-NEXT: s_setpc_b64 s[30:31] 7600; 7601; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7602; GFX940: ; %bb.0: 7603; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7604; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 7605; GFX940-NEXT: s_mov_b64 s[0:1], 0 7606; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 7607; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start 7608; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7609; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7610; GFX940-NEXT: v_mov_b32_e32 v5, v3 7611; GFX940-NEXT: v_sub_f16_e32 v3, v5, v2 7612; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 7613; GFX940-NEXT: buffer_wbl2 sc1 7614; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 7615; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7616; GFX940-NEXT: buffer_inv sc1 7617; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7618; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7619; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7620; GFX940-NEXT: s_cbranch_execnz .LBB28_1 7621; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7622; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7623; GFX940-NEXT: v_mov_b32_e32 v0, v3 7624; GFX940-NEXT: s_setpc_b64 s[30:31] 7625; 7626; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7627; GFX11: ; %bb.0: 7628; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7629; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 7630; GFX11-NEXT: s_mov_b32 s0, 0 7631; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start 7632; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7633; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7634; GFX11-NEXT: v_mov_b32_e32 v4, v3 7635; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7636; 
GFX11-NEXT: v_sub_f16_e32 v3, v4, v2 7637; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 7638; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7639; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7640; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7641; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 7642; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7643; GFX11-NEXT: buffer_gl1_inv 7644; GFX11-NEXT: buffer_gl0_inv 7645; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7646; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7647; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7648; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7649; GFX11-NEXT: s_cbranch_execnz .LBB28_1 7650; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7651; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7652; GFX11-NEXT: v_mov_b32_e32 v0, v3 7653; GFX11-NEXT: s_setpc_b64 s[30:31] 7654; 7655; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7656; GFX10: ; %bb.0: 7657; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7658; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 7659; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 7660; GFX10-NEXT: s_mov_b32 s4, 0 7661; GFX10-NEXT: flat_load_dword v0, v[3:4] 7662; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start 7663; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7664; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7665; GFX10-NEXT: v_mov_b32_e32 v1, v0 7666; GFX10-NEXT: v_sub_f16_e32 v0, v1, v2 7667; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 7668; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 7669; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7670; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 7671; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7672; GFX10-NEXT: buffer_gl1_inv 7673; GFX10-NEXT: buffer_gl0_inv 7674; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 7675; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7676; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7677; GFX10-NEXT: s_cbranch_execnz .LBB28_1 7678; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7679; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 7680; GFX10-NEXT: s_setpc_b64 s[30:31] 7681; 7682; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7683; GFX90A: ; %bb.0: 7684; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7685; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 7686; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7687; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 7688; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start 7689; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7690; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7691; GFX90A-NEXT: v_mov_b32_e32 v5, v3 7692; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2 7693; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 7694; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 7695; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7696; GFX90A-NEXT: buffer_wbinvl1 7697; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7698; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7699; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7700; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 7701; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7702; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7703; GFX90A-NEXT: v_mov_b32_e32 v0, v3 7704; GFX90A-NEXT: s_setpc_b64 s[30:31] 7705; 7706; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7707; GFX908: ; %bb.0: 7708; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7709; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 7710; GFX908-NEXT: s_mov_b64 s[4:5], 0 7711; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 7712; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start 7713; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7714; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7715; GFX908-NEXT: v_mov_b32_e32 v4, v3 7716; GFX908-NEXT: v_sub_f16_e32 v3, v4, v2 7717; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 7718; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 7719; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7720; GFX908-NEXT: buffer_wbinvl1 7721; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7722; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] 7723; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7724; GFX908-NEXT: s_cbranch_execnz .LBB28_1 7725; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7726; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7727; GFX908-NEXT: v_mov_b32_e32 v0, v3 7728; GFX908-NEXT: s_setpc_b64 s[30:31] 7729; 7730; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7731; GFX8: ; %bb.0: 7732; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7733; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 7734; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 7735; GFX8-NEXT: flat_load_dword v0, v[3:4] 7736; GFX8-NEXT: s_mov_b64 s[4:5], 0 7737; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start 7738; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7739; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7740; GFX8-NEXT: v_mov_b32_e32 v1, v0 7741; GFX8-NEXT: v_sub_f16_e32 v0, v1, v2 7742; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7743; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 7744; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 7745; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7746; GFX8-NEXT: buffer_wbinvl1 7747; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 7748; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7749; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7750; GFX8-NEXT: s_cbranch_execnz .LBB28_1 7751; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7752; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7753; GFX8-NEXT: s_setpc_b64 s[30:31] 7754; 7755; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: 7756; GFX7: ; %bb.0: 7757; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7758; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 7759; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7760; GFX7-NEXT: flat_load_dword v3, v[0:1] 7761; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7762; GFX7-NEXT: s_mov_b64 s[4:5], 0 7763; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 7764; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start 7765; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7766; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7767; GFX7-NEXT: v_mov_b32_e32 v4, v3 7768; 
; -----------------------------------------------------------------------------
; flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos
;
; Review note - the line structure of this file appears to have been flattened
; (many original lines, each carrying a fused line number, share one physical
; line). The physical line that follows this comment first finishes the hawaii
; checks and the IR body of the preceding test and only then opens the
; `define` for this one. The existing text is deliberately left untouched.
;
; IR under test - an `atomicrmw fsub` of a half located 1023 elements past
; %ptr, with syncscope("agent"), seq_cst ordering and `align 4`. The result is
; bound to %unused and the function returns void.
;
; What the autogenerated checks pin down
;  * every target expects a flat_atomic_cmpswap retry loop on a 32-bit word
;    (label .LBB29_1, block %atomicrmw.start);
;  * the new f16 value is merged into that word with a 0xffff0000 mask;
;  * the element offset shows up either as the immediate `offset:2046` or, in
;    the gfx1010, tonga and hawaii checks, as an explicit add of 0x7fe;
;  * inside the loop the word returned by cmpswap is copied back with
;    v_mov_b32 into the register used as the expected value, and nothing is
;    moved into v0 after the loop.
;
; NOTE(review) naming - the suffix order here is `__offset12b__align4_pos`
; while the returning test is spelled `__offset12b_pos__align4`; presumably
; unintentional. Renaming would change the symbol and every LABEL check line,
; so it is left as is.
;
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
; -----------------------------------------------------------------------------
GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 7769; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 7770; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 7771; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 7772; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 7773; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7774; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7775; GFX7-NEXT: buffer_wbinvl1 7776; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7777; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7778; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7779; GFX7-NEXT: s_cbranch_execnz .LBB28_1 7780; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7781; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7782; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 7783; GFX7-NEXT: s_setpc_b64 s[30:31] 7784 %gep = getelementptr half, ptr %ptr, i64 1023 7785 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4 7786 ret half %result 7787} 7788 7789define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, half %val) #0 { 7790; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7791; GFX12: ; %bb.0: 7792; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7793; GFX12-NEXT: s_wait_expcnt 0x0 7794; GFX12-NEXT: s_wait_samplecnt 0x0 7795; GFX12-NEXT: s_wait_bvhcnt 0x0 7796; GFX12-NEXT: s_wait_kmcnt 0x0 7797; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 7798; GFX12-NEXT: s_mov_b32 s0, 0 7799; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start 7800; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7801; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7802; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2 7803; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7804; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 7805; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7806; GFX12-NEXT: s_wait_storecnt 0x0 7807; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7808; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7809; GFX12-NEXT: global_inv scope:SCOPE_DEV 7810; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 7811; GFX12-NEXT: v_mov_b32_e32 v4, v3 7812; GFX12-NEXT: s_wait_alu 0xfffe 7813; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7814; GFX12-NEXT: s_wait_alu 0xfffe 7815; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7816; GFX12-NEXT: s_cbranch_execnz .LBB29_1 7817; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7818; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7819; GFX12-NEXT: s_wait_alu 0xfffe 7820; GFX12-NEXT: s_setpc_b64 s[30:31] 7821; 7822; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7823; GFX940: ; %bb.0: 7824; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7825; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 7826; GFX940-NEXT: s_mov_b64 s[0:1], 0 7827; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 7828; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start 7829; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7830; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7831; GFX940-NEXT: v_sub_f16_e32 v3, v5, v2 7832; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 7833; GFX940-NEXT: buffer_wbl2 sc1 7834; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 7835; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7836; GFX940-NEXT: buffer_inv sc1 7837; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7838; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7839; GFX940-NEXT: v_mov_b32_e32 v5, v3 7840; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7841; GFX940-NEXT: s_cbranch_execnz .LBB29_1 7842; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7843; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7844; GFX940-NEXT: s_setpc_b64 s[30:31] 7845; 7846; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7847; GFX11: ; %bb.0: 7848; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7849; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 7850; GFX11-NEXT: s_mov_b32 s0, 0 7851; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start 7852; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7853; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7854; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2 7855; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7856; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 7857; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7858; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7859; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 7860; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7861; GFX11-NEXT: buffer_gl1_inv 7862; GFX11-NEXT: buffer_gl0_inv 7863; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7864; GFX11-NEXT: v_mov_b32_e32 v4, v3 7865; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7866; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7867; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7868; GFX11-NEXT: s_cbranch_execnz .LBB29_1 7869; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7870; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7871; GFX11-NEXT: s_setpc_b64 s[30:31] 7872; 7873; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7874; GFX10: ; %bb.0: 7875; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7876; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 7877; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7878; GFX10-NEXT: s_mov_b32 s4, 0 7879; GFX10-NEXT: flat_load_dword v4, v[0:1] 7880; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start 7881; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7882; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7883; GFX10-NEXT: v_sub_f16_e32 v3, v4, v2 7884; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 7885; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7886; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7887; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7888; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7889; GFX10-NEXT: buffer_gl1_inv 7890; GFX10-NEXT: buffer_gl0_inv 7891; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7892; GFX10-NEXT: v_mov_b32_e32 v4, v3 7893; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7894; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7895; GFX10-NEXT: s_cbranch_execnz .LBB29_1 7896; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7897; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 7898; GFX10-NEXT: s_setpc_b64 s[30:31] 7899; 7900; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7901; GFX90A: ; %bb.0: 7902; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7903; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 7904; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7905; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 7906; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start 7907; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7908; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7909; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2 7910; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 7911; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 7912; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7913; GFX90A-NEXT: buffer_wbinvl1 7914; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7915; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7916; GFX90A-NEXT: v_mov_b32_e32 v5, v3 7917; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7918; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 7919; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7920; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7921; GFX90A-NEXT: s_setpc_b64 s[30:31] 7922; 7923; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7924; GFX908: ; %bb.0: 7925; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7926; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 7927; GFX908-NEXT: s_mov_b64 s[4:5], 0 7928; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 7929; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start 7930; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7931; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7932; GFX908-NEXT: v_sub_f16_e32 v3, v4, v2 7933; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 7934; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 7935; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7936; GFX908-NEXT: buffer_wbinvl1 7937; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7938; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7939; GFX908-NEXT: v_mov_b32_e32 v4, v3 7940; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] 7941; GFX908-NEXT: s_cbranch_execnz .LBB29_1 7942; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7943; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7944; GFX908-NEXT: s_setpc_b64 s[30:31] 7945; 7946; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7947; GFX8: ; %bb.0: 7948; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7949; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 7950; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7951; GFX8-NEXT: flat_load_dword v4, v[0:1] 7952; GFX8-NEXT: s_mov_b64 s[4:5], 0 7953; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start 7954; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7955; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7956; GFX8-NEXT: v_sub_f16_e32 v3, v4, v2 7957; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 7958; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 7959; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7960; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7961; GFX8-NEXT: buffer_wbinvl1 7962; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7963; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7964; GFX8-NEXT: v_mov_b32_e32 v4, v3 7965; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7966; GFX8-NEXT: s_cbranch_execnz .LBB29_1 7967; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7968; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7969; GFX8-NEXT: s_setpc_b64 s[30:31] 7970; 7971; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: 7972; GFX7: ; %bb.0: 7973; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7974; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 7975; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7976; GFX7-NEXT: flat_load_dword v3, v[0:1] 7977; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7978; GFX7-NEXT: s_mov_b64 s[4:5], 0 7979; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 7980; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start 7981; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7982; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7983; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 7984; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 7985; GFX7-NEXT: v_sub_f32_e32 v2, v2, 
v4 7986; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7987; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 7988; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7989; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7990; GFX7-NEXT: buffer_wbinvl1 7991; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7992; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7993; GFX7-NEXT: v_mov_b32_e32 v3, v2 7994; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7995; GFX7-NEXT: s_cbranch_execnz .LBB29_1 7996; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7997; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7998; GFX7-NEXT: s_setpc_b64 s[30:31] 7999 %gep = getelementptr half, ptr %ptr, i64 1023 8000 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4 8001 ret void 8002} 8003 8004define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 { 8005; GFX12-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8006; GFX12: ; %bb.0: 8007; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8008; GFX12-NEXT: s_wait_expcnt 0x0 8009; GFX12-NEXT: s_wait_samplecnt 0x0 8010; GFX12-NEXT: s_wait_bvhcnt 0x0 8011; GFX12-NEXT: s_wait_kmcnt 0x0 8012; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 8013; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8014; GFX12-NEXT: s_mov_b32 s0, 0 8015; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 8016; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 8017; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 8018; GFX12-NEXT: flat_load_b32 v5, v[0:1] 8019; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8020; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8021; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8022; GFX12-NEXT: v_not_b32_e32 v4, v4 8023; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start 8024; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 8025; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8026; GFX12-NEXT: v_mov_b32_e32 v6, v5 8027; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8028; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8029; GFX12-NEXT: 
v_sub_f16_e32 v5, v5, v2 8030; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8031; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 8032; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8033; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8034; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 8035; GFX12-NEXT: global_wb scope:SCOPE_SYS 8036; GFX12-NEXT: s_wait_storecnt 0x0 8037; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 8038; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8039; GFX12-NEXT: global_inv scope:SCOPE_SYS 8040; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8041; GFX12-NEXT: s_wait_alu 0xfffe 8042; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8043; GFX12-NEXT: s_wait_alu 0xfffe 8044; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8045; GFX12-NEXT: s_cbranch_execnz .LBB30_1 8046; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8047; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8048; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8049; GFX12-NEXT: s_wait_alu 0xfffe 8050; GFX12-NEXT: s_setpc_b64 s[30:31] 8051; 8052; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8053; GFX940: ; %bb.0: 8054; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8055; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 8056; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] 8057; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 8058; GFX940-NEXT: v_mov_b32_e32 v1, v7 8059; GFX940-NEXT: flat_load_dword v4, v[0:1] 8060; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 8061; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8062; GFX940-NEXT: s_mov_b32 s0, 0xffff 8063; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 8064; GFX940-NEXT: v_not_b32_e32 v5, v5 8065; GFX940-NEXT: s_mov_b64 s[0:1], 0 8066; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start 8067; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8068; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8069; GFX940-NEXT: v_mov_b32_e32 v7, v4 8070; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 8071; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 8072; GFX940-NEXT: 
v_lshlrev_b32_e32 v4, v3, v4 8073; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 8074; GFX940-NEXT: buffer_wbl2 sc0 sc1 8075; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 8076; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8077; GFX940-NEXT: buffer_inv sc0 sc1 8078; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 8079; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8080; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 8081; GFX940-NEXT: s_cbranch_execnz .LBB30_1 8082; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8083; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 8084; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 8085; GFX940-NEXT: s_setpc_b64 s[30:31] 8086; 8087; GFX11-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8088; GFX11: ; %bb.0: 8089; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8090; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 8091; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8092; GFX11-NEXT: s_mov_b32 s0, 0 8093; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 8094; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 8095; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 8096; GFX11-NEXT: flat_load_b32 v5, v[0:1] 8097; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8098; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8099; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8100; GFX11-NEXT: v_not_b32_e32 v4, v4 8101; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start 8102; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8103; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8104; GFX11-NEXT: v_mov_b32_e32 v6, v5 8105; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8106; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8107; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 8108; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8109; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 8110; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8111; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8112; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 8113; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8114; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 8115; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8116; GFX11-NEXT: buffer_gl1_inv 8117; GFX11-NEXT: buffer_gl0_inv 8118; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8119; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 8120; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8121; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8122; GFX11-NEXT: s_cbranch_execnz .LBB30_1 8123; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8124; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 8125; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8126; GFX11-NEXT: s_setpc_b64 s[30:31] 8127; 8128; GFX10-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8129; GFX10: ; %bb.0: 8130; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8131; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 8132; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8133; GFX10-NEXT: s_mov_b32 s4, 0 8134; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 8135; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 8136; GFX10-NEXT: flat_load_dword v5, v[0:1] 8137; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8138; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8139; GFX10-NEXT: v_not_b32_e32 v4, v4 8140; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start 8141; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8142; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8143; GFX10-NEXT: v_mov_b32_e32 v6, v5 8144; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8145; GFX10-NEXT: v_sub_f16_e32 v5, v5, v2 8146; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 8147; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 8148; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8149; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8150; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8151; GFX10-NEXT: buffer_gl1_inv 8152; GFX10-NEXT: buffer_gl0_inv 8153; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8154; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 8155; GFX10-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s4 8156; GFX10-NEXT: s_cbranch_execnz .LBB30_1 8157; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8158; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 8159; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8160; GFX10-NEXT: s_setpc_b64 s[30:31] 8161; 8162; GFX90A-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8163; GFX90A: ; %bb.0: 8164; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8165; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 8166; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8167; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 8168; GFX90A-NEXT: flat_load_dword v4, v[0:1] 8169; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 8170; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8171; GFX90A-NEXT: s_mov_b32 s4, 0xffff 8172; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 8173; GFX90A-NEXT: v_not_b32_e32 v5, v5 8174; GFX90A-NEXT: s_mov_b64 s[4:5], 0 8175; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start 8176; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8177; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8178; GFX90A-NEXT: v_mov_b32_e32 v7, v4 8179; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 8180; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 8181; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 8182; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 8183; GFX90A-NEXT: buffer_wbl2 8184; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 8185; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8186; GFX90A-NEXT: buffer_invl2 8187; GFX90A-NEXT: buffer_wbinvl1 8188; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 8189; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8190; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 8191; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 8192; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8193; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 8194; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 8195; GFX90A-NEXT: s_setpc_b64 s[30:31] 8196; 8197; GFX908-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8198; GFX908: ; %bb.0: 8199; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8200; GFX908-NEXT: v_add_co_u32_e32 
v3, vcc, 0x7fe, v0 8201; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8202; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 8203; GFX908-NEXT: flat_load_dword v4, v[0:1] 8204; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 8205; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8206; GFX908-NEXT: s_mov_b32 s4, 0xffff 8207; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 8208; GFX908-NEXT: v_not_b32_e32 v5, v5 8209; GFX908-NEXT: s_mov_b64 s[4:5], 0 8210; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start 8211; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8212; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8213; GFX908-NEXT: v_mov_b32_e32 v7, v4 8214; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 8215; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 8216; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 8217; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 8218; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc 8219; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8220; GFX908-NEXT: buffer_wbinvl1 8221; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 8222; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8223; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 8224; GFX908-NEXT: s_cbranch_execnz .LBB30_1 8225; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8226; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 8227; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 8228; GFX908-NEXT: s_setpc_b64 s[30:31] 8229; 8230; GFX8-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8231; GFX8: ; %bb.0: 8232; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8233; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 8234; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8235; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 8236; GFX8-NEXT: flat_load_dword v5, v[0:1] 8237; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 8238; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8239; GFX8-NEXT: s_mov_b32 s4, 0xffff 8240; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 8241; GFX8-NEXT: v_not_b32_e32 v4, v4 8242; GFX8-NEXT: s_mov_b64 s[4:5], 0 8243; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start 8244; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 
8245; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8246; GFX8-NEXT: v_mov_b32_e32 v6, v5 8247; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8248; GFX8-NEXT: v_sub_f16_e32 v5, v5, v2 8249; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 8250; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8251; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 8252; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8253; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8254; GFX8-NEXT: buffer_wbinvl1 8255; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 8256; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8257; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8258; GFX8-NEXT: s_cbranch_execnz .LBB30_1 8259; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8260; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8261; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8262; GFX8-NEXT: s_setpc_b64 s[30:31] 8263; 8264; GFX7-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: 8265; GFX7: ; %bb.0: 8266; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8267; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 8268; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8269; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 8270; GFX7-NEXT: flat_load_dword v5, v[0:1] 8271; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 8272; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 8273; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 8274; GFX7-NEXT: s_mov_b64 s[4:5], 0 8275; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 8276; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 8277; GFX7-NEXT: v_not_b32_e32 v4, v4 8278; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start 8279; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8280; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8281; GFX7-NEXT: v_mov_b32_e32 v6, v5 8282; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 8283; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 8284; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 8285; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 8286; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 8287; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 8288; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 8289; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8290; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr half, ptr %ptr, i64 1023
  %result = atomicrmw fsub ptr %gep, half %val seq_cst
  ret half %result
}

; Test case: seq_cst atomicrmw fsub of a half with no syncscope, at element
; index 1023 from %ptr (1023 * 2 bytes = 0x7fe, the constant added to the
; address in the expected output). The result is unused and the function
; returns void.
;
; For every RUN configuration the expected code is a compare-and-swap loop on
; the 4-byte aligned word that contains the half: the address is aligned with
; "and -4", a bit shift and a 0xffff mask are derived from the low two address
; bits, the subtracted value is merged into the loaded word, and the loop
; repeats until the cmpswap returns the value it expected. The hawaii checks
; differ in that they convert to f32, use v_sub_f32 and convert back.
define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
; GFX12-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: flat_load_dword v5, v[0:1]
; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB31_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB31_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX10-NEXT: v_sub_f16_e32 v3, v3, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX908-NEXT: v_sub_f16_e32 v3, v3, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB31_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX8-NEXT: v_sub_f16_e32 v3, v3, v2
; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr half, ptr %ptr, i64 1023
  %unused = atomicrmw fsub ptr %gep, half %val seq_cst
  ret void
}

;
--------------------------------------------------------------------
; bfloat
; --------------------------------------------------------------------

; Test case: seq_cst atomicrmw fsub of a bfloat directly at %ptr with
; syncscope("agent"); the previous value is returned.
;
; For every RUN configuration the expected code is a compare-and-swap loop on
; the 4-byte aligned word that contains the bfloat: the address is aligned
; with "and -4", a bit shift and a 0xffff mask are derived from the low two
; address bits, the loaded element is moved into the upper 16 bits of a dword,
; v_sub_f32 does the subtraction, and the upper 16 bits of the result are
; merged back into the word before the cmpswap. After the loop the last word
; returned by the cmpswap is shifted right by the same bit offset to form the
; return value.
define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
; GFX940-NEXT: flat_load_dword v5, v[0:1]
; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX940-NEXT: s_movk_i32 s2, 0x7fff
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB32_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB32_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v3, v0
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v5, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX908-NEXT: s_movk_i32 s6, 0x7fff
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB32_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v0
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst
  ret bfloat %result
}

define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_or_b32_e32
v8, 0x400000, v5 8974; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 8975; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 8976; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 8977; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 8978; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8979; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 8980; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8981; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8982; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 8983; GFX12-NEXT: s_wait_storecnt 0x0 8984; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 8985; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8986; GFX12-NEXT: global_inv scope:SCOPE_DEV 8987; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8988; GFX12-NEXT: s_wait_alu 0xfffe 8989; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8990; GFX12-NEXT: s_wait_alu 0xfffe 8991; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8992; GFX12-NEXT: s_cbranch_execnz .LBB33_1 8993; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8994; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8995; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8996; GFX12-NEXT: s_wait_alu 0xfffe 8997; GFX12-NEXT: s_setpc_b64 s[30:31] 8998; 8999; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9000; GFX940: ; %bb.0: 9001; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9002; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 9003; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 9004; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 9005; GFX940-NEXT: v_mov_b32_e32 v1, v5 9006; GFX940-NEXT: flat_load_dword v5, v[0:1] 9007; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 9008; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9009; GFX940-NEXT: s_mov_b32 s0, 0xffff 9010; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9011; GFX940-NEXT: v_not_b32_e32 v4, v4 9012; GFX940-NEXT: s_mov_b64 s[0:1], 0 9013; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9014; GFX940-NEXT: s_movk_i32 s2, 0x7fff 9015; GFX940-NEXT: .LBB33_1: 
; %atomicrmw.start 9016; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9017; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9018; GFX940-NEXT: v_mov_b32_e32 v7, v5 9019; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9020; GFX940-NEXT: s_nop 0 9021; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 9022; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 9023; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 9024; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 9025; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9026; GFX940-NEXT: s_nop 1 9027; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9028; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9029; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 9030; GFX940-NEXT: buffer_wbl2 sc1 9031; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 9032; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9033; GFX940-NEXT: buffer_inv sc1 9034; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9035; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9036; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9037; GFX940-NEXT: s_cbranch_execnz .LBB33_1 9038; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9039; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9040; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9041; GFX940-NEXT: s_setpc_b64 s[30:31] 9042; 9043; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9044; GFX11: ; %bb.0: 9045; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9046; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9047; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9048; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9049; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9050; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9051; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9052; GFX11-NEXT: s_mov_b32 s0, 0 9053; GFX11-NEXT: flat_load_b32 v5, v[0:1] 9054; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9055; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 
9056; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9057; GFX11-NEXT: v_not_b32_e32 v4, v4 9058; GFX11-NEXT: .p2align 6 9059; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start 9060; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9061; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9062; GFX11-NEXT: v_mov_b32_e32 v6, v5 9063; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9064; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9065; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9066; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9067; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 9068; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 9069; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 9070; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9071; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9072; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9073; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9074; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9075; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9076; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9077; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9078; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 9079; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9080; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 9081; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9082; GFX11-NEXT: buffer_gl1_inv 9083; GFX11-NEXT: buffer_gl0_inv 9084; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9085; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9086; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9087; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9088; GFX11-NEXT: s_cbranch_execnz .LBB33_1 9089; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9090; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9091; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9092; GFX11-NEXT: s_setpc_b64 s[30:31] 9093; 9094; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9095; GFX10: ; %bb.0: 9096; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 9097; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9098; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9099; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9100; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9101; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9102; GFX10-NEXT: s_mov_b32 s4, 0 9103; GFX10-NEXT: flat_load_dword v5, v[0:1] 9104; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9105; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9106; GFX10-NEXT: v_not_b32_e32 v4, v4 9107; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start 9108; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9109; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9110; GFX10-NEXT: v_mov_b32_e32 v6, v5 9111; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9112; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 9113; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 9114; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 9115; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9116; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9117; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9118; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9119; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 9120; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9121; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9122; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9123; GFX10-NEXT: buffer_gl1_inv 9124; GFX10-NEXT: buffer_gl0_inv 9125; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9126; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9127; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9128; GFX10-NEXT: s_cbranch_execnz .LBB33_1 9129; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9130; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9131; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9132; GFX10-NEXT: s_setpc_b64 s[30:31] 9133; 9134; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9135; GFX90A: ; %bb.0: 9136; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9137; 
GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 9138; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9139; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9140; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9141; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9142; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9143; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9144; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9145; GFX90A-NEXT: v_not_b32_e32 v4, v4 9146; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9147; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9148; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 9149; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start 9150; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9151; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9152; GFX90A-NEXT: v_mov_b32_e32 v7, v5 9153; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9154; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 9155; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 9156; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 9157; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 9158; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9159; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9160; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9161; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 9162; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 9163; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9164; GFX90A-NEXT: buffer_wbinvl1 9165; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9166; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9167; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9168; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 9169; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9170; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9171; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9172; GFX90A-NEXT: s_setpc_b64 s[30:31] 9173; 9174; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9175; GFX908: ; %bb.0: 9176; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9177; GFX908-NEXT: 
v_add_co_u32_e32 v3, vcc, 0x7fe, v0 9178; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9179; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9180; GFX908-NEXT: flat_load_dword v5, v[0:1] 9181; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9182; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9183; GFX908-NEXT: s_mov_b32 s4, 0xffff 9184; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9185; GFX908-NEXT: v_not_b32_e32 v4, v4 9186; GFX908-NEXT: s_mov_b64 s[4:5], 0 9187; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9188; GFX908-NEXT: s_movk_i32 s6, 0x7fff 9189; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start 9190; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9191; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9192; GFX908-NEXT: v_mov_b32_e32 v6, v5 9193; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9194; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 9195; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 9196; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 9197; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 9198; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9199; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 9200; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9201; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 9202; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9203; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9204; GFX908-NEXT: buffer_wbinvl1 9205; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9206; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9207; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9208; GFX908-NEXT: s_cbranch_execnz .LBB33_1 9209; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9210; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9211; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9212; GFX908-NEXT: s_setpc_b64 s[30:31] 9213; 9214; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9215; GFX8: ; %bb.0: 9216; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9217; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 
9218; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9219; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9220; GFX8-NEXT: flat_load_dword v5, v[0:1] 9221; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9222; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9223; GFX8-NEXT: s_mov_b32 s4, 0xffff 9224; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9225; GFX8-NEXT: v_not_b32_e32 v4, v4 9226; GFX8-NEXT: s_mov_b64 s[4:5], 0 9227; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9228; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start 9229; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9230; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9231; GFX8-NEXT: v_mov_b32_e32 v6, v5 9232; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9233; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 9234; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 9235; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 9236; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 9237; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 9238; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9239; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 9240; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 9241; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9242; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 9243; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9244; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9245; GFX8-NEXT: buffer_wbinvl1 9246; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9247; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9248; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9249; GFX8-NEXT: s_cbranch_execnz .LBB33_1 9250; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9251; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9252; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9253; GFX8-NEXT: s_setpc_b64 s[30:31] 9254; 9255; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: 9256; GFX7: ; %bb.0: 9257; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9258; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 9259; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9260; 
GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9261; GFX7-NEXT: flat_load_dword v5, v[0:1] 9262; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 9263; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9264; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 9265; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9266; GFX7-NEXT: v_not_b32_e32 v4, v4 9267; GFX7-NEXT: s_mov_b64 s[4:5], 0 9268; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9269; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start 9270; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9271; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9272; GFX7-NEXT: v_mov_b32_e32 v6, v5 9273; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9274; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9275; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 9276; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9277; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 9278; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9279; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 9280; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9281; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9282; GFX7-NEXT: buffer_wbinvl1 9283; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9284; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9285; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9286; GFX7-NEXT: s_cbranch_execnz .LBB33_1 9287; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9288; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9289; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9290; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9291; GFX7-NEXT: s_setpc_b64 s[30:31] 9292 %gep = getelementptr bfloat, ptr %ptr, i64 1023 9293 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst 9294 ret bfloat %result 9295} 9296 9297define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 { 9298; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9299; GFX12: ; %bb.0: 9300; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9301; GFX12-NEXT: s_wait_expcnt 0x0 9302; GFX12-NEXT: s_wait_samplecnt 0x0 9303; GFX12-NEXT: s_wait_bvhcnt 0x0 9304; GFX12-NEXT: s_wait_kmcnt 0x0 9305; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 
0xfffff800, v0 9306; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9307; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9308; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9309; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9310; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9311; GFX12-NEXT: s_mov_b32 s0, 0 9312; GFX12-NEXT: flat_load_b32 v5, v[0:1] 9313; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9314; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9315; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9316; GFX12-NEXT: v_not_b32_e32 v4, v4 9317; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start 9318; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9319; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9320; GFX12-NEXT: v_mov_b32_e32 v6, v5 9321; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9322; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9323; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9324; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9325; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 9326; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 9327; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 9328; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9329; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9330; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9331; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9332; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9333; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9334; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9335; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9336; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 9337; GFX12-NEXT: s_wait_storecnt 0x0 9338; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9339; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9340; GFX12-NEXT: global_inv scope:SCOPE_DEV 9341; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9342; GFX12-NEXT: s_wait_alu 0xfffe 9343; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9344; GFX12-NEXT: s_wait_alu 0xfffe 9345; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9346; GFX12-NEXT: s_cbranch_execnz .LBB34_1 9347; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9348; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9349; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9350; GFX12-NEXT: s_wait_alu 0xfffe 9351; GFX12-NEXT: s_setpc_b64 s[30:31] 9352; 9353; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9354; GFX940: ; %bb.0: 9355; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9356; GFX940-NEXT: s_movk_i32 s0, 0xf800 9357; GFX940-NEXT: s_mov_b32 s1, -1 9358; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 9359; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 9360; GFX940-NEXT: v_mov_b32_e32 v1, v5 9361; GFX940-NEXT: flat_load_dword v5, v[0:1] 9362; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 9363; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9364; GFX940-NEXT: s_mov_b32 s0, 0xffff 9365; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9366; GFX940-NEXT: v_not_b32_e32 v4, v4 9367; GFX940-NEXT: s_mov_b64 s[0:1], 0 9368; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9369; GFX940-NEXT: s_movk_i32 s2, 0x7fff 9370; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start 9371; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9372; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9373; GFX940-NEXT: v_mov_b32_e32 v7, v5 9374; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9375; GFX940-NEXT: s_nop 0 9376; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 9377; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 9378; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 9379; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 9380; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9381; GFX940-NEXT: s_nop 1 9382; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9383; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9384; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 9385; GFX940-NEXT: buffer_wbl2 sc1 9386; 
GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 9387; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9388; GFX940-NEXT: buffer_inv sc1 9389; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9390; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9391; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9392; GFX940-NEXT: s_cbranch_execnz .LBB34_1 9393; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9394; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9395; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9396; GFX940-NEXT: s_setpc_b64 s[30:31] 9397; 9398; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9399; GFX11: ; %bb.0: 9400; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9401; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9402; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9403; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9404; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9405; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9406; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9407; GFX11-NEXT: s_mov_b32 s0, 0 9408; GFX11-NEXT: flat_load_b32 v5, v[0:1] 9409; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9410; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9411; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9412; GFX11-NEXT: v_not_b32_e32 v4, v4 9413; GFX11-NEXT: .p2align 6 9414; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start 9415; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9416; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9417; GFX11-NEXT: v_mov_b32_e32 v6, v5 9418; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9419; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9420; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9421; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9422; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 9423; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 9424; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 9425; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9426; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) 9427; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9428; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9429; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9430; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9431; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9432; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9433; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 9434; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9435; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 9436; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9437; GFX11-NEXT: buffer_gl1_inv 9438; GFX11-NEXT: buffer_gl0_inv 9439; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9440; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9441; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9442; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9443; GFX11-NEXT: s_cbranch_execnz .LBB34_1 9444; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9445; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9446; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9447; GFX11-NEXT: s_setpc_b64 s[30:31] 9448; 9449; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9450; GFX10: ; %bb.0: 9451; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9452; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9453; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9454; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9455; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9456; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9457; GFX10-NEXT: s_mov_b32 s4, 0 9458; GFX10-NEXT: flat_load_dword v5, v[0:1] 9459; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9460; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9461; GFX10-NEXT: v_not_b32_e32 v4, v4 9462; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start 9463; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9464; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9465; GFX10-NEXT: v_mov_b32_e32 v6, v5 9466; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9467; 
GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 9468; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 9469; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 9470; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9471; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9472; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9473; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9474; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 9475; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9476; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9477; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9478; GFX10-NEXT: buffer_gl1_inv 9479; GFX10-NEXT: buffer_gl0_inv 9480; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9481; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9482; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9483; GFX10-NEXT: s_cbranch_execnz .LBB34_1 9484; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9485; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9486; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9487; GFX10-NEXT: s_setpc_b64 s[30:31] 9488; 9489; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9490; GFX90A: ; %bb.0: 9491; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9492; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 9493; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 9494; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9495; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9496; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9497; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9498; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9499; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9500; GFX90A-NEXT: v_not_b32_e32 v4, v4 9501; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9502; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9503; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 9504; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start 9505; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9506; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9507; GFX90A-NEXT: v_mov_b32_e32 v7, v5 9508; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9509; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 9510; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 9511; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 9512; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 9513; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9514; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9515; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9516; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 9517; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 9518; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9519; GFX90A-NEXT: buffer_wbinvl1 9520; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9521; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9522; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9523; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 9524; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9525; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9526; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9527; GFX90A-NEXT: s_setpc_b64 s[30:31] 9528; 9529; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9530; GFX908: ; %bb.0: 9531; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9532; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 9533; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 9534; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9535; GFX908-NEXT: flat_load_dword v5, v[0:1] 9536; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9537; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9538; GFX908-NEXT: s_mov_b32 s4, 0xffff 9539; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9540; GFX908-NEXT: v_not_b32_e32 v4, v4 9541; GFX908-NEXT: s_mov_b64 s[4:5], 0 9542; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9543; GFX908-NEXT: s_movk_i32 s6, 0x7fff 9544; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start 9545; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9546; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9547; GFX908-NEXT: v_mov_b32_e32 v6, v5 9548; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD 9549; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 9550; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 9551; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 9552; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 9553; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9554; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 9555; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9556; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 9557; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9558; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9559; GFX908-NEXT: buffer_wbinvl1 9560; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9561; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9562; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9563; GFX908-NEXT: s_cbranch_execnz .LBB34_1 9564; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9565; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9566; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9567; GFX908-NEXT: s_setpc_b64 s[30:31] 9568; 9569; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9570; GFX8: ; %bb.0: 9571; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9572; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 9573; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 9574; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9575; GFX8-NEXT: flat_load_dword v5, v[0:1] 9576; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9577; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9578; GFX8-NEXT: s_mov_b32 s4, 0xffff 9579; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9580; GFX8-NEXT: v_not_b32_e32 v4, v4 9581; GFX8-NEXT: s_mov_b64 s[4:5], 0 9582; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9583; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start 9584; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9585; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9586; GFX8-NEXT: v_mov_b32_e32 v6, v5 9587; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9588; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 9589; GFX8-NEXT: v_bfe_u32 v8, v5, 
16, 1 9590; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 9591; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 9592; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 9593; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9594; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 9595; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 9596; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9597; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 9598; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9599; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9600; GFX8-NEXT: buffer_wbinvl1 9601; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9602; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9603; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9604; GFX8-NEXT: s_cbranch_execnz .LBB34_1 9605; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9606; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9607; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9608; GFX8-NEXT: s_setpc_b64 s[30:31] 9609; 9610; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: 9611; GFX7: ; %bb.0: 9612; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9613; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 9614; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 9615; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9616; GFX7-NEXT: flat_load_dword v5, v[0:1] 9617; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 9618; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9619; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 9620; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9621; GFX7-NEXT: v_not_b32_e32 v4, v4 9622; GFX7-NEXT: s_mov_b64 s[4:5], 0 9623; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9624; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start 9625; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9626; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9627; GFX7-NEXT: v_mov_b32_e32 v6, v5 9628; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9629; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9630; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 9631; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9632; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 9633; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9634; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 9635; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9636; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9637; GFX7-NEXT: buffer_wbinvl1 9638; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9639; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9640; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9641; GFX7-NEXT: s_cbranch_execnz .LBB34_1 9642; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9643; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9644; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9645; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9646; GFX7-NEXT: s_setpc_b64 s[30:31] 9647 %gep = getelementptr bfloat, ptr %ptr, i64 -1024 9648 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst 9649 ret bfloat %result 9650 } 9651 9652define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { 9653; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16: 9654; GFX12: ; %bb.0: 9655; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9656; GFX12-NEXT: s_wait_expcnt 0x0 9657; GFX12-NEXT: s_wait_samplecnt 0x0 9658; GFX12-NEXT: s_wait_bvhcnt 0x0 9659; GFX12-NEXT: s_wait_kmcnt 0x0 9660; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 9661; GFX12-NEXT: s_mov_b32 s0, 0 9662; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9663; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9664; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9665; GFX12-NEXT: flat_load_b32 v4, v[0:1] 9666; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9667; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9668; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9669; GFX12-NEXT: v_not_b32_e32 v6, v3 9670; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start 9671; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9672; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9673; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9674; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9675; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9676; GFX12-NEXT: 
v_sub_f32_e32 v3, v3, v2 9677; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 9678; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 9679; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 9680; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 9681; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 9682; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9683; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 9684; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 9685; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9686; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9687; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 9688; GFX12-NEXT: s_wait_storecnt 0x0 9689; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9690; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9691; GFX12-NEXT: global_inv scope:SCOPE_DEV 9692; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9693; GFX12-NEXT: v_mov_b32_e32 v4, v3 9694; GFX12-NEXT: s_wait_alu 0xfffe 9695; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9696; GFX12-NEXT: s_wait_alu 0xfffe 9697; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9698; GFX12-NEXT: s_cbranch_execnz .LBB35_1 9699; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9700; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9701; GFX12-NEXT: s_wait_alu 0xfffe 9702; GFX12-NEXT: s_setpc_b64 s[30:31] 9703; 9704; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: 9705; GFX940: ; %bb.0: 9706; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9707; GFX940-NEXT: v_mov_b32_e32 v3, v0 9708; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 9709; GFX940-NEXT: flat_load_dword v5, v[0:1] 9710; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 9711; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9712; GFX940-NEXT: s_mov_b32 s0, 0xffff 9713; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9714; GFX940-NEXT: v_not_b32_e32 v6, v4 9715; GFX940-NEXT: s_mov_b64 s[0:1], 0 9716; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9717; GFX940-NEXT: s_movk_i32 s2, 0x7fff 9718; 
GFX940-NEXT: .LBB35_1: ; %atomicrmw.start 9719; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9720; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9721; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9722; GFX940-NEXT: s_nop 0 9723; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 9724; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 9725; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 9726; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 9727; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9728; GFX940-NEXT: s_nop 1 9729; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc 9730; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9731; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 9732; GFX940-NEXT: buffer_wbl2 sc1 9733; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 9734; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9735; GFX940-NEXT: buffer_inv sc1 9736; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9737; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9738; GFX940-NEXT: v_mov_b32_e32 v5, v4 9739; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9740; GFX940-NEXT: s_cbranch_execnz .LBB35_1 9741; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9742; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9743; GFX940-NEXT: s_setpc_b64 s[30:31] 9744; 9745; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16: 9746; GFX11: ; %bb.0: 9747; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9748; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 9749; GFX11-NEXT: s_mov_b32 s0, 0 9750; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9751; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9752; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9753; GFX11-NEXT: flat_load_b32 v4, v[0:1] 9754; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9755; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9756; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9757; GFX11-NEXT: v_not_b32_e32 v6, v3 9758; GFX11-NEXT: .p2align 6 9759; 
GFX11-NEXT: .LBB35_1: ; %atomicrmw.start 9760; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9761; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9762; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9763; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9764; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9765; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2 9766; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 9767; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 9768; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 9769; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 9770; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 9771; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9772; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 9773; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 9774; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9775; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9776; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 9777; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9778; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 9779; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9780; GFX11-NEXT: buffer_gl1_inv 9781; GFX11-NEXT: buffer_gl0_inv 9782; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9783; GFX11-NEXT: v_mov_b32_e32 v4, v3 9784; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9785; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9786; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9787; GFX11-NEXT: s_cbranch_execnz .LBB35_1 9788; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9789; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9790; GFX11-NEXT: s_setpc_b64 s[30:31] 9791; 9792; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16: 9793; GFX10: ; %bb.0: 9794; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9795; GFX10-NEXT: v_mov_b32_e32 v3, v0 9796; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9797; GFX10-NEXT: s_mov_b32 s4, 0 9798; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9799; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 
9800; GFX10-NEXT: flat_load_dword v4, v[0:1] 9801; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9802; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9803; GFX10-NEXT: v_not_b32_e32 v6, v3 9804; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start 9805; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9806; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9807; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9808; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 9809; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 9810; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 9811; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 9812; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 9813; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 9814; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9815; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 9816; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9817; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9818; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9819; GFX10-NEXT: buffer_gl1_inv 9820; GFX10-NEXT: buffer_gl0_inv 9821; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 9822; GFX10-NEXT: v_mov_b32_e32 v4, v3 9823; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9824; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9825; GFX10-NEXT: s_cbranch_execnz .LBB35_1 9826; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9827; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9828; GFX10-NEXT: s_setpc_b64 s[30:31] 9829; 9830; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16: 9831; GFX90A: ; %bb.0: 9832; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9833; GFX90A-NEXT: v_mov_b32_e32 v3, v0 9834; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9835; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9836; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9837; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9838; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9839; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9840; GFX90A-NEXT: v_not_b32_e32 v6, v4 9841; GFX90A-NEXT: s_mov_b64 s[4:5], 0 
9842; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9843; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 9844; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start 9845; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9846; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9847; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9848; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 9849; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 9850; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 9851; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 9852; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9853; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc 9854; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9855; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 9856; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 9857; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9858; GFX90A-NEXT: buffer_wbinvl1 9859; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 9860; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9861; GFX90A-NEXT: v_mov_b32_e32 v5, v4 9862; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9863; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 9864; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9865; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9866; GFX90A-NEXT: s_setpc_b64 s[30:31] 9867; 9868; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16: 9869; GFX908: ; %bb.0: 9870; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9871; GFX908-NEXT: v_mov_b32_e32 v3, v0 9872; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9873; GFX908-NEXT: flat_load_dword v4, v[0:1] 9874; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9875; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9876; GFX908-NEXT: s_mov_b32 s4, 0xffff 9877; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9878; GFX908-NEXT: v_not_b32_e32 v6, v3 9879; GFX908-NEXT: s_mov_b64 s[4:5], 0 9880; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9881; GFX908-NEXT: s_movk_i32 s6, 0x7fff 9882; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start 9883; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 9884; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9885; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9886; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 9887; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 9888; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 9889; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 9890; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9891; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 9892; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9893; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 9894; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9895; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9896; GFX908-NEXT: buffer_wbinvl1 9897; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9898; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9899; GFX908-NEXT: v_mov_b32_e32 v4, v3 9900; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9901; GFX908-NEXT: s_cbranch_execnz .LBB35_1 9902; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9903; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9904; GFX908-NEXT: s_setpc_b64 s[30:31] 9905; 9906; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16: 9907; GFX8: ; %bb.0: 9908; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9909; GFX8-NEXT: v_mov_b32_e32 v3, v0 9910; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9911; GFX8-NEXT: flat_load_dword v4, v[0:1] 9912; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9913; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9914; GFX8-NEXT: s_mov_b32 s4, 0xffff 9915; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 9916; GFX8-NEXT: v_not_b32_e32 v6, v3 9917; GFX8-NEXT: s_mov_b64 s[4:5], 0 9918; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9919; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start 9920; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9921; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9922; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9923; GFX8-NEXT: v_sub_f32_e32 v3, 
v3, v2 9924; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 9925; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 9926; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 9927; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 9928; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9929; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc 9930; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 9931; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9932; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 9933; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9934; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9935; GFX8-NEXT: buffer_wbinvl1 9936; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9937; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9938; GFX8-NEXT: v_mov_b32_e32 v4, v3 9939; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9940; GFX8-NEXT: s_cbranch_execnz .LBB35_1 9941; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9942; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9943; GFX8-NEXT: s_setpc_b64 s[30:31] 9944; 9945; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16: 9946; GFX7: ; %bb.0: 9947; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9948; GFX7-NEXT: v_mov_b32_e32 v3, v0 9949; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9950; GFX7-NEXT: flat_load_dword v4, v[0:1] 9951; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 9952; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9953; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 9954; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9955; GFX7-NEXT: v_not_b32_e32 v6, v3 9956; GFX7-NEXT: s_mov_b64 s[4:5], 0 9957; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9958; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start 9959; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9960; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9961; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9962; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9963; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 9964; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 9965; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 9966; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 9967; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 9968; 
GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 9969; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9970; GFX7-NEXT: buffer_wbinvl1 9971; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 9972; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9973; GFX7-NEXT: v_mov_b32_e32 v4, v3 9974; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9975; GFX7-NEXT: s_cbranch_execnz .LBB35_1 9976; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9977; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9978; GFX7-NEXT: s_setpc_b64 s[30:31] 9979 %unused = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst 9980 ret void 9981} 9982 9983define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { 9984; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 9985; GFX12: ; %bb.0: 9986; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9987; GFX12-NEXT: s_wait_expcnt 0x0 9988; GFX12-NEXT: s_wait_samplecnt 0x0 9989; GFX12-NEXT: s_wait_bvhcnt 0x0 9990; GFX12-NEXT: s_wait_kmcnt 0x0 9991; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 9992; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9993; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 9994; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9995; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 9996; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 9997; GFX12-NEXT: s_mov_b32 s0, 0 9998; GFX12-NEXT: flat_load_b32 v3, v[0:1] 9999; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10000; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10001; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10002; GFX12-NEXT: v_not_b32_e32 v5, v5 10003; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start 10004; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10005; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10006; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10007; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10008; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10009; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 10010; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) 10011; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 10012; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 10013; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10014; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10015; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10016; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10017; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10018; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10019; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10020; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 10021; GFX12-NEXT: s_wait_storecnt 0x0 10022; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 10023; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10024; GFX12-NEXT: global_inv scope:SCOPE_DEV 10025; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10026; GFX12-NEXT: v_mov_b32_e32 v3, v2 10027; GFX12-NEXT: s_wait_alu 0xfffe 10028; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10029; GFX12-NEXT: s_wait_alu 0xfffe 10030; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10031; GFX12-NEXT: s_cbranch_execnz .LBB36_1 10032; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10033; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10034; GFX12-NEXT: s_wait_alu 0xfffe 10035; GFX12-NEXT: s_setpc_b64 s[30:31] 10036; 10037; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10038; GFX940: ; %bb.0: 10039; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10040; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 10041; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 10042; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 10043; GFX940-NEXT: v_mov_b32_e32 v1, v5 10044; GFX940-NEXT: flat_load_dword v3, v[0:1] 10045; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 10046; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10047; GFX940-NEXT: s_mov_b32 s0, 0xffff 10048; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 10049; GFX940-NEXT: v_not_b32_e32 v5, v5 10050; GFX940-NEXT: s_mov_b64 s[0:1], 0 10051; GFX940-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 10052; GFX940-NEXT: s_movk_i32 s2, 0x7fff 10053; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start 10054; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10055; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10056; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10057; GFX940-NEXT: s_nop 0 10058; GFX940-NEXT: v_sub_f32_e32 v2, v2, v6 10059; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 10060; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 10061; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 10062; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10063; GFX940-NEXT: s_nop 1 10064; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10065; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10066; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 10067; GFX940-NEXT: buffer_wbl2 sc1 10068; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 10069; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10070; GFX940-NEXT: buffer_inv sc1 10071; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10072; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10073; GFX940-NEXT: v_mov_b32_e32 v3, v2 10074; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10075; GFX940-NEXT: s_cbranch_execnz .LBB36_1 10076; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10077; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10078; GFX940-NEXT: s_setpc_b64 s[30:31] 10079; 10080; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10081; GFX11: ; %bb.0: 10082; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10083; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 10084; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10085; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10086; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 10087; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 10088; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 10089; GFX11-NEXT: s_mov_b32 s0, 0 10090; GFX11-NEXT: flat_load_b32 v3, v[0:1] 10091; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10092; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10093; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10094; GFX11-NEXT: v_not_b32_e32 v5, v5 10095; GFX11-NEXT: .p2align 6 10096; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start 10097; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10098; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10099; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10100; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10101; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10102; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 10103; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10104; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 10105; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 10106; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10107; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10108; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10109; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10110; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10111; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10112; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10113; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 10114; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10115; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 10116; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10117; GFX11-NEXT: buffer_gl1_inv 10118; GFX11-NEXT: buffer_gl0_inv 10119; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10120; GFX11-NEXT: v_mov_b32_e32 v3, v2 10121; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10122; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10123; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10124; GFX11-NEXT: s_cbranch_execnz .LBB36_1 10125; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10126; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 10127; GFX11-NEXT: s_setpc_b64 s[30:31] 10128; 10129; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10130; GFX10: ; 
%bb.0: 10131; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10132; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 10133; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10134; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10135; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 10136; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 10137; GFX10-NEXT: s_mov_b32 s4, 0 10138; GFX10-NEXT: flat_load_dword v3, v[0:1] 10139; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10140; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10141; GFX10-NEXT: v_not_b32_e32 v5, v5 10142; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start 10143; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10144; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10145; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10146; GFX10-NEXT: v_sub_f32_e32 v2, v2, v6 10147; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 10148; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 10149; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10150; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10151; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10152; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10153; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 10154; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10155; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10156; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10157; GFX10-NEXT: buffer_gl1_inv 10158; GFX10-NEXT: buffer_gl0_inv 10159; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10160; GFX10-NEXT: v_mov_b32_e32 v3, v2 10161; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10162; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10163; GFX10-NEXT: s_cbranch_execnz .LBB36_1 10164; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10165; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10166; GFX10-NEXT: s_setpc_b64 s[30:31] 10167; 10168; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10169; GFX90A: ; %bb.0: 10170; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 10171; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 10172; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10173; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 10174; GFX90A-NEXT: flat_load_dword v3, v[0:1] 10175; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 10176; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10177; GFX90A-NEXT: s_mov_b32 s4, 0xffff 10178; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10179; GFX90A-NEXT: v_not_b32_e32 v5, v5 10180; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10181; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10182; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 10183; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start 10184; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10185; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10186; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10187; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 10188; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 10189; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 10190; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 10191; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10192; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10193; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10194; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 10195; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10196; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10197; GFX90A-NEXT: buffer_wbinvl1 10198; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10199; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10200; GFX90A-NEXT: v_mov_b32_e32 v3, v2 10201; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10202; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 10203; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10204; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10205; GFX90A-NEXT: s_setpc_b64 s[30:31] 10206; 10207; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10208; GFX908: ; %bb.0: 10209; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10210; 
GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 10211; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10212; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 10213; GFX908-NEXT: flat_load_dword v3, v[0:1] 10214; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 10215; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10216; GFX908-NEXT: s_mov_b32 s4, 0xffff 10217; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10218; GFX908-NEXT: v_not_b32_e32 v5, v5 10219; GFX908-NEXT: s_mov_b64 s[4:5], 0 10220; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10221; GFX908-NEXT: s_movk_i32 s6, 0x7fff 10222; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start 10223; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10224; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10225; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10226; GFX908-NEXT: v_sub_f32_e32 v2, v2, v6 10227; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 10228; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 10229; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 10230; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10231; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10232; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10233; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 10234; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10235; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10236; GFX908-NEXT: buffer_wbinvl1 10237; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10238; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10239; GFX908-NEXT: v_mov_b32_e32 v3, v2 10240; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10241; GFX908-NEXT: s_cbranch_execnz .LBB36_1 10242; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10243; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10244; GFX908-NEXT: s_setpc_b64 s[30:31] 10245; 10246; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10247; GFX8: ; %bb.0: 10248; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10249; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, 
v0 10250; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10251; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 10252; GFX8-NEXT: flat_load_dword v3, v[0:1] 10253; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 10254; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10255; GFX8-NEXT: s_mov_b32 s4, 0xffff 10256; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10257; GFX8-NEXT: v_not_b32_e32 v5, v5 10258; GFX8-NEXT: s_mov_b64 s[4:5], 0 10259; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10260; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start 10261; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10262; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10263; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10264; GFX8-NEXT: v_sub_f32_e32 v2, v2, v6 10265; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 10266; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 10267; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10268; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 10269; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10270; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 10271; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 10272; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10273; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 10274; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10275; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10276; GFX8-NEXT: buffer_wbinvl1 10277; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10278; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10279; GFX8-NEXT: v_mov_b32_e32 v3, v2 10280; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10281; GFX8-NEXT: s_cbranch_execnz .LBB36_1 10282; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10283; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10284; GFX8-NEXT: s_setpc_b64 s[30:31] 10285; 10286; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: 10287; GFX7: ; %bb.0: 10288; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10289; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 10290; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
; No-return bf16 fsub at agent scope with a negative pointer offset.
;
; The IR at the bottom of this function offsets %ptr by -1024 bfloat elements
; and performs a seq_cst `atomicrmw fsub` whose result (%unused) is never read.
; In the assertions the address adjustment appears as an add of 0xfffff800 to
; the low pointer half with -1 carried into the high half; gfx940 instead
; builds the constant in s[0:1] and adds it with v_lshl_add_u64.
;
; Every checked target then masks the address with -4, derives a bit shift
; from the low two address bits (and 3, shift left by 3), builds an inverted
; 0xffff mask at that shift, and retries a 32-bit flat_atomic_cmpswap until the
; returned dword equals the expected one. All targets except hawaii narrow the
; f32 difference with the bfe / +0x7fff / or 0x400000 / v_cmp_u select
; sequence; the hawaii output only shifts the f32 result right by 16.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_not_b32_e32 v5, v5
; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT:    v_mov_b32_e32 v3, v2
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_movk_i32 s0, 0xf800
; GFX940-NEXT:    s_mov_b32 s1, -1
; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX940-NEXT:    v_mov_b32_e32 v1, v5
; GFX940-NEXT:    flat_load_dword v3, v[0:1]
; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT:    s_mov_b32 s0, 0xffff
; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
; GFX940-NEXT:    v_not_b32_e32 v5, v5
; GFX940-NEXT:    s_mov_b64 s[0:1], 0
; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
; GFX940-NEXT:    s_nop 1
; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT:    buffer_wbl2 sc1
; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc1
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_not_b32_e32 v5, v5
; GFX11-NEXT:    .p2align 6
; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT:    v_mov_b32_e32 v3, v2
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    flat_load_dword v3, v[0:1]
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX10-NEXT:    v_not_b32_e32 v5, v5
; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
; GFX90A-NEXT:    v_not_b32_e32 v5, v5
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT:    s_mov_b32 s4, 0xffff
; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
; GFX908-NEXT:    v_not_b32_e32 v5, v5
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v3, v2
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT:    s_mov_b32 s4, 0xffff
; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
; GFX8-NEXT:    v_not_b32_e32 v5, v5
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
; GFX7-NEXT:    flat_load_dword v3, v[0:1]
; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT:    v_not_b32_e32 v5, v5
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v3, v2
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
  %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
  ret void
}
; Returning bf16 fsub at agent scope, positive offset, with `align 4` on the
; atomic.
;
; The IR at the bottom of this function offsets %ptr by 1023 bfloat elements
; and issues a seq_cst `atomicrmw fsub` carrying an explicit `align 4`; the
; atomicrmw result is returned as a bfloat. In the assertions the byte offset
; 2046 is folded into the flat instructions (offset:2046) for gfx1200, gfx940,
; gfx1100, gfx90a and gfx908, while gfx1010, tonga and hawaii add 0x7fe to the
; pointer before the first load.
;
; None of the checked sequences mask the address or compute a sub-dword shift:
; the loop works on the low 16 bits of the loaded dword and keeps the upper
; half with a 0xffff0000 mask before the 32-bit flat_atomic_cmpswap. The
; hawaii output additionally shifts the result left by 16 before returning.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v4, v3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_bfe_u32 v5, v3, 16, 1
; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v3
; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    v_mov_b32_e32 v0, v3
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT:    s_mov_b64 s[0:1], 0
; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v5, v3
; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX940-NEXT:    v_sub_f32_e32 v3, v3, v2
; GFX940-NEXT:    v_bfe_u32 v4, v3, 16, 1
; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v3
; GFX940-NEXT:    v_add3_u32 v4, v4, v3, s2
; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
; GFX940-NEXT:    s_nop 1
; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX940-NEXT:    v_and_or_b32 v4, v5, s3, v3
; GFX940-NEXT:    buffer_wbl2 sc1
; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc1
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT:    v_mov_b32_e32 v0, v3
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    .p2align 6
; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; GFX11-NEXT:    v_sub_f32_e32 v3, v3, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_bfe_u32 v5, v3, 16, 1
; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v3
; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    flat_load_dword v0, v[3:4]
; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc_lo
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT:    v_and_or_b32 v5, 0xffff0000, v6, v0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX90A-NEXT:    v_sub_f32_e32 v3, v3, v2
; GFX90A-NEXT:    v_bfe_u32 v4, v3, 16, 1
; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v3
; GFX90A-NEXT:    v_add3_u32 v4, v4, v3, s6
; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v3
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; GFX908-NEXT:    v_sub_f32_e32 v3, v3, v2
; GFX908-NEXT:    v_bfe_u32 v5, v3, 16, 1
; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v3
; GFX908-NEXT:    v_add3_u32 v5, v5, v3, s6
; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
; GFX908-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX908-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX908-NEXT:    v_and_or_b32 v3, v4, s7, v3
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[3:4]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v6, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v3, v[0:1]
; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr bfloat, ptr %ptr, i64 1023
  %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
  ret bfloat %result
}
GFX12-NEXT: s_wait_alu 0xfffe 10988; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10989; GFX12-NEXT: s_wait_alu 0xfffe 10990; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10991; GFX12-NEXT: s_cbranch_execnz .LBB39_1 10992; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10993; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10994; GFX12-NEXT: s_wait_alu 0xfffe 10995; GFX12-NEXT: s_setpc_b64 s[30:31] 10996; 10997; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 10998; GFX940: ; %bb.0: 10999; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11000; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 11001; GFX940-NEXT: s_mov_b64 s[0:1], 0 11002; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11003; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11004; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 11005; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start 11006; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11007; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11008; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11009; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 11010; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 11011; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 11012; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 11013; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11014; GFX940-NEXT: s_nop 1 11015; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 11016; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11017; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 11018; GFX940-NEXT: buffer_wbl2 sc1 11019; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 11020; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11021; GFX940-NEXT: buffer_inv sc1 11022; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11023; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11024; GFX940-NEXT: v_mov_b32_e32 v3, v2 11025; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11026; GFX940-NEXT: s_cbranch_execnz .LBB39_1 11027; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11028; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11029; GFX940-NEXT: s_setpc_b64 s[30:31] 11030; 11031; GFX11-LABEL: 
flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 11032; GFX11: ; %bb.0: 11033; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11034; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 11035; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11036; GFX11-NEXT: s_mov_b32 s0, 0 11037; GFX11-NEXT: .p2align 6 11038; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start 11039; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11040; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11041; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11042; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11043; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 11044; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 11045; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 11046; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11047; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11048; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 11049; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 11050; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11051; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11052; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 11053; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11054; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc 11055; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11056; GFX11-NEXT: buffer_gl1_inv 11057; GFX11-NEXT: buffer_gl0_inv 11058; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11059; GFX11-NEXT: v_mov_b32_e32 v3, v2 11060; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11061; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11062; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11063; GFX11-NEXT: s_cbranch_execnz .LBB39_1 11064; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11065; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11066; GFX11-NEXT: s_setpc_b64 s[30:31] 11067; 11068; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 11069; GFX10: ; %bb.0: 11070; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 11071; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 11072; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11073; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11074; GFX10-NEXT: s_mov_b32 s4, 0 11075; GFX10-NEXT: flat_load_dword v3, v[0:1] 11076; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start 11077; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11078; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11079; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11080; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 11081; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 11082; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 11083; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11084; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 11085; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 11086; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11087; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 11088; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11089; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11090; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11091; GFX10-NEXT: buffer_gl1_inv 11092; GFX10-NEXT: buffer_gl0_inv 11093; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11094; GFX10-NEXT: v_mov_b32_e32 v3, v2 11095; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11096; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11097; GFX10-NEXT: s_cbranch_execnz .LBB39_1 11098; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11099; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11100; GFX10-NEXT: s_setpc_b64 s[30:31] 11101; 11102; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 11103; GFX90A: ; %bb.0: 11104; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11105; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 11106; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11107; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11108; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11109; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 11110; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start 11111; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11112; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 11113; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11114; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 11115; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 11116; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 11117; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 11118; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11119; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 11120; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11121; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 11122; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 11123; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11124; GFX90A-NEXT: buffer_wbinvl1 11125; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11126; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11127; GFX90A-NEXT: v_mov_b32_e32 v3, v2 11128; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11129; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 11130; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11131; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11132; GFX90A-NEXT: s_setpc_b64 s[30:31] 11133; 11134; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 11135; GFX908: ; %bb.0: 11136; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11137; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 11138; GFX908-NEXT: s_mov_b64 s[4:5], 0 11139; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11140; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11141; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 11142; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start 11143; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11144; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11145; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11146; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 11147; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 11148; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 11149; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 11150; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11151; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 11152; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11153; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 11154; GFX908-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 11155; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11156; GFX908-NEXT: buffer_wbinvl1 11157; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11158; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11159; GFX908-NEXT: v_mov_b32_e32 v3, v2 11160; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11161; GFX908-NEXT: s_cbranch_execnz .LBB39_1 11162; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11163; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11164; GFX908-NEXT: s_setpc_b64 s[30:31] 11165; 11166; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 11167; GFX8: ; %bb.0: 11168; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11169; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 11170; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11171; GFX8-NEXT: flat_load_dword v3, v[0:1] 11172; GFX8-NEXT: s_mov_b64 s[4:5], 0 11173; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11174; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start 11175; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11176; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11177; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11178; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 11179; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 11180; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 11181; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 11182; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 11183; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11184; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 11185; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc 11186; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11187; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11188; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11189; GFX8-NEXT: buffer_wbinvl1 11190; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11191; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11192; GFX8-NEXT: v_mov_b32_e32 v3, v2 11193; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11194; GFX8-NEXT: s_cbranch_execnz .LBB39_1 11195; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end 11196; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11197; GFX8-NEXT: s_setpc_b64 s[30:31] 11198; 11199; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: 11200; GFX7: ; %bb.0: 11201; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11202; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 11203; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11204; GFX7-NEXT: flat_load_dword v3, v[0:1] 11205; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11206; GFX7-NEXT: s_mov_b64 s[4:5], 0 11207; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 11208; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start 11209; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11210; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11211; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11212; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4 11213; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 11214; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11215; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 11216; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11217; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11218; GFX7-NEXT: buffer_wbinvl1 11219; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11220; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11221; GFX7-NEXT: v_mov_b32_e32 v3, v2 11222; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11223; GFX7-NEXT: s_cbranch_execnz .LBB39_1 11224; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11225; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11226; GFX7-NEXT: s_setpc_b64 s[30:31] 11227 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11228 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4 11229 ret void 11230} 11231 11232define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { 11233; GFX12-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11234; GFX12: ; %bb.0: 11235; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11236; GFX12-NEXT: s_wait_expcnt 0x0 11237; GFX12-NEXT: s_wait_samplecnt 0x0 11238; GFX12-NEXT: s_wait_bvhcnt 0x0 11239; GFX12-NEXT: s_wait_kmcnt 0x0 11240; GFX12-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11241; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11242; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11243; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11244; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 11245; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 11246; GFX12-NEXT: s_mov_b32 s0, 0 11247; GFX12-NEXT: flat_load_b32 v5, v[0:1] 11248; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11249; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11250; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11251; GFX12-NEXT: v_not_b32_e32 v4, v4 11252; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start 11253; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11254; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11255; GFX12-NEXT: v_mov_b32_e32 v6, v5 11256; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11257; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11258; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11259; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11260; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 11261; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 11262; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 11263; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11264; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11265; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11266; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11267; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11268; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11269; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11270; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11271; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 11272; GFX12-NEXT: global_wb scope:SCOPE_SYS 11273; GFX12-NEXT: s_wait_storecnt 0x0 11274; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 11275; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11276; GFX12-NEXT: global_inv scope:SCOPE_SYS 
11277; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11278; GFX12-NEXT: s_wait_alu 0xfffe 11279; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11280; GFX12-NEXT: s_wait_alu 0xfffe 11281; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11282; GFX12-NEXT: s_cbranch_execnz .LBB40_1 11283; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11284; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11285; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11286; GFX12-NEXT: s_wait_alu 0xfffe 11287; GFX12-NEXT: s_setpc_b64 s[30:31] 11288; 11289; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11290; GFX940: ; %bb.0: 11291; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11292; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 11293; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11294; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11295; GFX940-NEXT: v_mov_b32_e32 v1, v5 11296; GFX940-NEXT: flat_load_dword v5, v[0:1] 11297; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 11298; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11299; GFX940-NEXT: s_mov_b32 s0, 0xffff 11300; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 11301; GFX940-NEXT: v_not_b32_e32 v4, v4 11302; GFX940-NEXT: s_mov_b64 s[0:1], 0 11303; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11304; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11305; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start 11306; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11307; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11308; GFX940-NEXT: v_mov_b32_e32 v7, v5 11309; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11310; GFX940-NEXT: s_nop 0 11311; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 11312; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 11313; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 11314; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 11315; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11316; GFX940-NEXT: s_nop 1 11317; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11318; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 11319; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 11320; GFX940-NEXT: buffer_wbl2 sc0 sc1 11321; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 11322; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11323; GFX940-NEXT: buffer_inv sc0 sc1 11324; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11325; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11326; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11327; GFX940-NEXT: s_cbranch_execnz .LBB40_1 11328; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11329; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11330; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11331; GFX940-NEXT: s_setpc_b64 s[30:31] 11332; 11333; GFX11-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11334; GFX11: ; %bb.0: 11335; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11336; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11337; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11338; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11339; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11340; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 11341; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 11342; GFX11-NEXT: s_mov_b32 s0, 0 11343; GFX11-NEXT: flat_load_b32 v5, v[0:1] 11344; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11345; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11346; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11347; GFX11-NEXT: v_not_b32_e32 v4, v4 11348; GFX11-NEXT: .p2align 6 11349; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start 11350; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11351; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11352; GFX11-NEXT: v_mov_b32_e32 v6, v5 11353; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11354; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11355; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11356; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11357; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 11358; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 
11359; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 11360; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11361; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11362; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11363; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11364; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11365; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11366; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11367; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11368; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 11369; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11370; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 11371; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11372; GFX11-NEXT: buffer_gl1_inv 11373; GFX11-NEXT: buffer_gl0_inv 11374; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11375; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11376; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11377; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11378; GFX11-NEXT: s_cbranch_execnz .LBB40_1 11379; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11380; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11381; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11382; GFX11-NEXT: s_setpc_b64 s[30:31] 11383; 11384; GFX10-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11385; GFX10: ; %bb.0: 11386; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11387; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11388; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11389; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11390; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 11391; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 11392; GFX10-NEXT: s_mov_b32 s4, 0 11393; GFX10-NEXT: flat_load_dword v5, v[0:1] 11394; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11395; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11396; GFX10-NEXT: v_not_b32_e32 v4, v4 11397; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start 11398; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11399; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 11400; GFX10-NEXT: v_mov_b32_e32 v6, v5 11401; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11402; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 11403; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 11404; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 11405; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11406; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11407; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11408; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11409; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 11410; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11411; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11412; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11413; GFX10-NEXT: buffer_gl1_inv 11414; GFX10-NEXT: buffer_gl0_inv 11415; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11416; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11417; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11418; GFX10-NEXT: s_cbranch_execnz .LBB40_1 11419; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11420; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11421; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11422; GFX10-NEXT: s_setpc_b64 s[30:31] 11423; 11424; GFX90A-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11425; GFX90A: ; %bb.0: 11426; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11427; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 11428; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11429; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 11430; GFX90A-NEXT: flat_load_dword v5, v[0:1] 11431; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 11432; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11433; GFX90A-NEXT: s_mov_b32 s4, 0xffff 11434; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11435; GFX90A-NEXT: v_not_b32_e32 v4, v4 11436; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11437; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11438; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11439; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start 
11440; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11441; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11442; GFX90A-NEXT: v_mov_b32_e32 v7, v5 11443; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11444; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 11445; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 11446; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 11447; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 11448; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11449; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11450; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11451; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 11452; GFX90A-NEXT: buffer_wbl2 11453; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 11454; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11455; GFX90A-NEXT: buffer_invl2 11456; GFX90A-NEXT: buffer_wbinvl1 11457; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11458; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11459; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11460; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 11461; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11462; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11463; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11464; GFX90A-NEXT: s_setpc_b64 s[30:31] 11465; 11466; GFX908-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11467; GFX908: ; %bb.0: 11468; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11469; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 11470; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11471; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 11472; GFX908-NEXT: flat_load_dword v5, v[0:1] 11473; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 11474; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11475; GFX908-NEXT: s_mov_b32 s4, 0xffff 11476; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11477; GFX908-NEXT: v_not_b32_e32 v4, v4 11478; GFX908-NEXT: s_mov_b64 s[4:5], 0 11479; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11480; 
GFX908-NEXT: s_movk_i32 s6, 0x7fff 11481; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start 11482; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11483; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11484; GFX908-NEXT: v_mov_b32_e32 v6, v5 11485; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11486; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 11487; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 11488; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 11489; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 11490; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11491; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 11492; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11493; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 11494; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11495; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11496; GFX908-NEXT: buffer_wbinvl1 11497; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11498; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11499; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11500; GFX908-NEXT: s_cbranch_execnz .LBB40_1 11501; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11502; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11503; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11504; GFX908-NEXT: s_setpc_b64 s[30:31] 11505; 11506; GFX8-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11507; GFX8: ; %bb.0: 11508; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11509; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 11510; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11511; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 11512; GFX8-NEXT: flat_load_dword v5, v[0:1] 11513; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 11514; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11515; GFX8-NEXT: s_mov_b32 s4, 0xffff 11516; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11517; GFX8-NEXT: v_not_b32_e32 v4, v4 11518; GFX8-NEXT: s_mov_b64 s[4:5], 0 11519; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11520; GFX8-NEXT: .LBB40_1: 
; %atomicrmw.start 11521; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11522; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11523; GFX8-NEXT: v_mov_b32_e32 v6, v5 11524; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11525; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 11526; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 11527; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 11528; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11529; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 11530; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11531; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11532; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 11533; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11534; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 11535; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11536; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11537; GFX8-NEXT: buffer_wbinvl1 11538; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11539; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11540; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11541; GFX8-NEXT: s_cbranch_execnz .LBB40_1 11542; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11543; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11544; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11545; GFX8-NEXT: s_setpc_b64 s[30:31] 11546; 11547; GFX7-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: 11548; GFX7: ; %bb.0: 11549; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11550; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 11551; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11552; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 11553; GFX7-NEXT: flat_load_dword v5, v[0:1] 11554; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 11555; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11556; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 11557; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11558; GFX7-NEXT: v_not_b32_e32 v4, v4 11559; GFX7-NEXT: s_mov_b64 s[4:5], 0 11560; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11561; GFX7-NEXT: .LBB40_1: ; 
%atomicrmw.start 11562; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11563; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11564; GFX7-NEXT: v_mov_b32_e32 v6, v5 11565; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11566; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11567; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 11568; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11569; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 11570; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11571; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 11572; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11573; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11574; GFX7-NEXT: buffer_wbinvl1 11575; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11576; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11577; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11578; GFX7-NEXT: s_cbranch_execnz .LBB40_1 11579; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11580; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11581; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11582; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 11583; GFX7-NEXT: s_setpc_b64 s[30:31] 11584 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11585 %result = atomicrmw fsub ptr %gep, bfloat %val seq_cst 11586 ret bfloat %result 11587} 11588 11589define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { 11590; GFX12-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11591; GFX12: ; %bb.0: 11592; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11593; GFX12-NEXT: s_wait_expcnt 0x0 11594; GFX12-NEXT: s_wait_samplecnt 0x0 11595; GFX12-NEXT: s_wait_bvhcnt 0x0 11596; GFX12-NEXT: s_wait_kmcnt 0x0 11597; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 11598; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11599; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11600; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11601; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 11602; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 11603; GFX12-NEXT: s_mov_b32 s0, 0 11604; GFX12-NEXT: flat_load_b32 v3, v[0:1] 11605; 
GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11606; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 11607; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11608; GFX12-NEXT: v_not_b32_e32 v5, v5 11609; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start 11610; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11611; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11612; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 11613; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11614; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11615; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 11616; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 11617; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 11618; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 11619; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11620; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 11621; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11622; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 11623; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11624; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11625; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 11626; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 11627; GFX12-NEXT: global_wb scope:SCOPE_SYS 11628; GFX12-NEXT: s_wait_storecnt 0x0 11629; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 11630; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11631; GFX12-NEXT: global_inv scope:SCOPE_SYS 11632; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11633; GFX12-NEXT: v_mov_b32_e32 v3, v2 11634; GFX12-NEXT: s_wait_alu 0xfffe 11635; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11636; GFX12-NEXT: s_wait_alu 0xfffe 11637; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11638; GFX12-NEXT: s_cbranch_execnz .LBB41_1 11639; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11640; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11641; GFX12-NEXT: s_wait_alu 0xfffe 11642; GFX12-NEXT: s_setpc_b64 s[30:31] 11643; 11644; 
GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11645; GFX940: ; %bb.0: 11646; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11647; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 11648; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11649; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11650; GFX940-NEXT: v_mov_b32_e32 v1, v5 11651; GFX940-NEXT: flat_load_dword v3, v[0:1] 11652; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 11653; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11654; GFX940-NEXT: s_mov_b32 s0, 0xffff 11655; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 11656; GFX940-NEXT: v_not_b32_e32 v5, v5 11657; GFX940-NEXT: s_mov_b64 s[0:1], 0 11658; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11659; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11660; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start 11661; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11662; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11663; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11664; GFX940-NEXT: s_nop 0 11665; GFX940-NEXT: v_sub_f32_e32 v2, v2, v6 11666; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 11667; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 11668; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 11669; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11670; GFX940-NEXT: s_nop 1 11671; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 11672; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11673; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 11674; GFX940-NEXT: buffer_wbl2 sc0 sc1 11675; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 11676; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11677; GFX940-NEXT: buffer_inv sc0 sc1 11678; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11679; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11680; GFX940-NEXT: v_mov_b32_e32 v3, v2 11681; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11682; GFX940-NEXT: s_cbranch_execnz .LBB41_1 11683; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 
11684; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11685; GFX940-NEXT: s_setpc_b64 s[30:31] 11686; 11687; GFX11-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11688; GFX11: ; %bb.0: 11689; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11690; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 11691; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11692; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11693; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11694; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 11695; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 11696; GFX11-NEXT: s_mov_b32 s0, 0 11697; GFX11-NEXT: flat_load_b32 v3, v[0:1] 11698; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11699; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 11700; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11701; GFX11-NEXT: v_not_b32_e32 v5, v5 11702; GFX11-NEXT: .p2align 6 11703; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start 11704; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11705; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11706; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 11707; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11708; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11709; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 11710; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 11711; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 11712; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 11713; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11714; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 11715; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11716; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 11717; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11718; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11719; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 11720; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 11721; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11722; 
GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 11723; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11724; GFX11-NEXT: buffer_gl1_inv 11725; GFX11-NEXT: buffer_gl0_inv 11726; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11727; GFX11-NEXT: v_mov_b32_e32 v3, v2 11728; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11729; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11730; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11731; GFX11-NEXT: s_cbranch_execnz .LBB41_1 11732; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11733; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11734; GFX11-NEXT: s_setpc_b64 s[30:31] 11735; 11736; GFX10-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11737; GFX10: ; %bb.0: 11738; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11739; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 11740; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11741; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11742; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 11743; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 11744; GFX10-NEXT: s_mov_b32 s4, 0 11745; GFX10-NEXT: flat_load_dword v3, v[0:1] 11746; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11747; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 11748; GFX10-NEXT: v_not_b32_e32 v5, v5 11749; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start 11750; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11751; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11752; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11753; GFX10-NEXT: v_sub_f32_e32 v2, v2, v6 11754; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 11755; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 11756; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11757; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 11758; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 11759; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11760; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 11761; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 
11762; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11763; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11764; GFX10-NEXT: buffer_gl1_inv 11765; GFX10-NEXT: buffer_gl0_inv 11766; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11767; GFX10-NEXT: v_mov_b32_e32 v3, v2 11768; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11769; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11770; GFX10-NEXT: s_cbranch_execnz .LBB41_1 11771; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11772; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11773; GFX10-NEXT: s_setpc_b64 s[30:31] 11774; 11775; GFX90A-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11776; GFX90A: ; %bb.0: 11777; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11778; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 11779; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11780; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 11781; GFX90A-NEXT: flat_load_dword v3, v[0:1] 11782; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 11783; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11784; GFX90A-NEXT: s_mov_b32 s4, 0xffff 11785; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 11786; GFX90A-NEXT: v_not_b32_e32 v5, v5 11787; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11788; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11789; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11790; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start 11791; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11792; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11793; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11794; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 11795; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 11796; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 11797; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 11798; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11799; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 11800; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11801; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 11802; 
GFX90A-NEXT: buffer_wbl2 11803; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11804; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11805; GFX90A-NEXT: buffer_invl2 11806; GFX90A-NEXT: buffer_wbinvl1 11807; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11808; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11809; GFX90A-NEXT: v_mov_b32_e32 v3, v2 11810; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11811; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 11812; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11813; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11814; GFX90A-NEXT: s_setpc_b64 s[30:31] 11815; 11816; GFX908-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11817; GFX908: ; %bb.0: 11818; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11819; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 11820; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11821; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 11822; GFX908-NEXT: flat_load_dword v3, v[0:1] 11823; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 11824; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11825; GFX908-NEXT: s_mov_b32 s4, 0xffff 11826; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 11827; GFX908-NEXT: v_not_b32_e32 v5, v5 11828; GFX908-NEXT: s_mov_b64 s[4:5], 0 11829; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11830; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11831; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start 11832; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11833; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11834; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11835; GFX908-NEXT: v_sub_f32_e32 v2, v2, v6 11836; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 11837; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 11838; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 11839; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11840; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 11841; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11842; GFX908-NEXT: 
v_and_or_b32 v2, v3, v5, v2 11843; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11844; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11845; GFX908-NEXT: buffer_wbinvl1 11846; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11847; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11848; GFX908-NEXT: v_mov_b32_e32 v3, v2 11849; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11850; GFX908-NEXT: s_cbranch_execnz .LBB41_1 11851; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11852; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11853; GFX908-NEXT: s_setpc_b64 s[30:31] 11854; 11855; GFX8-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11856; GFX8: ; %bb.0: 11857; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11858; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 11859; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11860; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 11861; GFX8-NEXT: flat_load_dword v3, v[0:1] 11862; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 11863; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11864; GFX8-NEXT: s_mov_b32 s4, 0xffff 11865; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 11866; GFX8-NEXT: v_not_b32_e32 v5, v5 11867; GFX8-NEXT: s_mov_b64 s[4:5], 0 11868; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11869; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start 11870; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11871; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11872; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11873; GFX8-NEXT: v_sub_f32_e32 v2, v2, v6 11874; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 11875; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 11876; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11877; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 11878; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11879; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 11880; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 11881; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11882; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 11883; 
GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11884; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11885; GFX8-NEXT: buffer_wbinvl1 11886; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11887; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11888; GFX8-NEXT: v_mov_b32_e32 v3, v2 11889; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11890; GFX8-NEXT: s_cbranch_execnz .LBB41_1 11891; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11892; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11893; GFX8-NEXT: s_setpc_b64 s[30:31] 11894; 11895; GFX7-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 11896; GFX7: ; %bb.0: 11897; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11898; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 11899; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11900; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 11901; GFX7-NEXT: flat_load_dword v3, v[0:1] 11902; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 11903; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11904; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 11905; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11906; GFX7-NEXT: v_not_b32_e32 v5, v5 11907; GFX7-NEXT: s_mov_b64 s[4:5], 0 11908; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 11909; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start 11910; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11911; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11912; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 11913; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11914; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 11915; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11916; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 11917; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 11918; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 11919; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11920; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11921; GFX7-NEXT: buffer_wbinvl1 11922; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11923; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11924; GFX7-NEXT: v_mov_b32_e32 v3, v2 11925; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11926; GFX7-NEXT: s_cbranch_execnz .LBB41_1 11927; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end 11928; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11929; GFX7-NEXT: s_setpc_b64 s[30:31] 11930 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11931 %unused = atomicrmw fsub ptr %gep, bfloat %val seq_cst 11932 ret void 11933} 11934 11935; -------------------------------------------------------------------- 11936; <2 x half> 11937; -------------------------------------------------------------------- 11938 11939define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #0 { 11940; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2f16: 11941; GFX12: ; %bb.0: 11942; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11943; GFX12-NEXT: s_wait_expcnt 0x0 11944; GFX12-NEXT: s_wait_samplecnt 0x0 11945; GFX12-NEXT: s_wait_bvhcnt 0x0 11946; GFX12-NEXT: s_wait_kmcnt 0x0 11947; GFX12-NEXT: flat_load_b32 v3, v[0:1] 11948; GFX12-NEXT: s_mov_b32 s0, 0 11949; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start 11950; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11951; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11952; GFX12-NEXT: v_mov_b32_e32 v4, v3 11953; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11954; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 11955; GFX12-NEXT: s_wait_storecnt 0x0 11956; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 11957; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11958; GFX12-NEXT: global_inv scope:SCOPE_DEV 11959; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 11960; GFX12-NEXT: s_wait_alu 0xfffe 11961; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11962; GFX12-NEXT: s_wait_alu 0xfffe 11963; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11964; GFX12-NEXT: s_cbranch_execnz .LBB42_1 11965; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11966; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11967; GFX12-NEXT: v_mov_b32_e32 v0, v3 11968; GFX12-NEXT: s_wait_alu 0xfffe 11969; GFX12-NEXT: s_setpc_b64 s[30:31] 11970; 11971; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16: 11972; GFX940: ; %bb.0: 11973; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 11974; GFX940-NEXT: flat_load_dword v3, v[0:1] 11975; GFX940-NEXT: s_mov_b64 s[0:1], 0 11976; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start 11977; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11978; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11979; GFX940-NEXT: v_mov_b32_e32 v5, v3 11980; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 11981; GFX940-NEXT: buffer_wbl2 sc1 11982; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 11983; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11984; GFX940-NEXT: buffer_inv sc1 11985; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 11986; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11987; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11988; GFX940-NEXT: s_cbranch_execnz .LBB42_1 11989; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11990; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11991; GFX940-NEXT: v_mov_b32_e32 v0, v3 11992; GFX940-NEXT: s_setpc_b64 s[30:31] 11993; 11994; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16: 11995; GFX11: ; %bb.0: 11996; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11997; GFX11-NEXT: flat_load_b32 v3, v[0:1] 11998; GFX11-NEXT: s_mov_b32 s0, 0 11999; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start 12000; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12001; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12002; GFX11-NEXT: v_mov_b32_e32 v4, v3 12003; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12004; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12005; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12006; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 12007; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12008; GFX11-NEXT: buffer_gl1_inv 12009; GFX11-NEXT: buffer_gl0_inv 12010; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12011; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12012; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12013; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12014; GFX11-NEXT: s_cbranch_execnz .LBB42_1 12015; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12016; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12017; GFX11-NEXT: v_mov_b32_e32 v0, v3 12018; GFX11-NEXT: s_setpc_b64 s[30:31] 12019; 12020; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16: 12021; GFX10: ; %bb.0: 12022; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12023; GFX10-NEXT: flat_load_dword v3, v[0:1] 12024; GFX10-NEXT: s_mov_b32 s4, 0 12025; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start 12026; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12027; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12028; GFX10-NEXT: v_mov_b32_e32 v4, v3 12029; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12030; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12031; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12032; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12033; GFX10-NEXT: buffer_gl1_inv 12034; GFX10-NEXT: buffer_gl0_inv 12035; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12036; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12037; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12038; GFX10-NEXT: s_cbranch_execnz .LBB42_1 12039; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12040; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12041; GFX10-NEXT: v_mov_b32_e32 v0, v3 12042; GFX10-NEXT: s_setpc_b64 s[30:31] 12043; 12044; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16: 12045; GFX90A: ; %bb.0: 12046; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12047; GFX90A-NEXT: flat_load_dword v3, v[0:1] 12048; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12049; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start 12050; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12051; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12052; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12053; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12054; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 12055; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12056; GFX90A-NEXT: buffer_wbinvl1 12057; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12058; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12059; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12060; 
GFX90A-NEXT: s_cbranch_execnz .LBB42_1 12061; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12062; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12063; GFX90A-NEXT: v_mov_b32_e32 v0, v3 12064; GFX90A-NEXT: s_setpc_b64 s[30:31] 12065; 12066; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16: 12067; GFX908: ; %bb.0: 12068; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12069; GFX908-NEXT: flat_load_dword v3, v[0:1] 12070; GFX908-NEXT: s_mov_b64 s[4:5], 0 12071; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start 12072; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12073; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12074; GFX908-NEXT: v_mov_b32_e32 v4, v3 12075; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12076; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12077; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12078; GFX908-NEXT: buffer_wbinvl1 12079; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12080; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12081; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12082; GFX908-NEXT: s_cbranch_execnz .LBB42_1 12083; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12084; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12085; GFX908-NEXT: v_mov_b32_e32 v0, v3 12086; GFX908-NEXT: s_setpc_b64 s[30:31] 12087; 12088; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2f16: 12089; GFX8: ; %bb.0: 12090; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12091; GFX8-NEXT: flat_load_dword v3, v[0:1] 12092; GFX8-NEXT: s_mov_b64 s[4:5], 0 12093; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start 12094; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12095; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12096; GFX8-NEXT: v_mov_b32_e32 v4, v3 12097; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 12098; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12099; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 12100; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 12101; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 12102; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 
12103; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12104; GFX8-NEXT: buffer_wbinvl1 12105; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12106; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12107; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12108; GFX8-NEXT: s_cbranch_execnz .LBB42_1 12109; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12110; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12111; GFX8-NEXT: v_mov_b32_e32 v0, v3 12112; GFX8-NEXT: s_setpc_b64 s[30:31] 12113; 12114; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16: 12115; GFX7: ; %bb.0: 12116; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12117; GFX7-NEXT: flat_load_dword v5, v[0:1] 12118; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 12119; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 12120; GFX7-NEXT: s_mov_b64 s[4:5], 0 12121; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 12122; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12123; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 12124; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 12125; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12126; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 12127; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start 12128; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12129; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 12130; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 12131; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 12132; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 12133; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 12134; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 12135; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 12136; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 12137; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 12138; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 12139; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 12140; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 12141; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc 12142; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12143; GFX7-NEXT: buffer_wbinvl1 12144; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 12145; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 12146; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12147; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 12148; GFX7-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] 12149; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12150; GFX7-NEXT: s_cbranch_execnz .LBB42_1 12151; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12152; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12153; GFX7-NEXT: v_mov_b32_e32 v0, v2 12154; GFX7-NEXT: v_mov_b32_e32 v1, v3 12155; GFX7-NEXT: s_setpc_b64 s[30:31] 12156 %result = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst 12157 ret <2 x half> %result 12158} 12159 12160define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 { 12161; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12162; GFX12: ; %bb.0: 12163; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12164; GFX12-NEXT: s_wait_expcnt 0x0 12165; GFX12-NEXT: s_wait_samplecnt 0x0 12166; GFX12-NEXT: s_wait_bvhcnt 0x0 12167; GFX12-NEXT: s_wait_kmcnt 0x0 12168; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 12169; GFX12-NEXT: s_mov_b32 s0, 0 12170; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start 12171; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12172; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12173; GFX12-NEXT: v_mov_b32_e32 v4, v3 12174; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 12175; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12176; GFX12-NEXT: s_wait_storecnt 0x0 12177; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12178; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12179; GFX12-NEXT: global_inv scope:SCOPE_DEV 12180; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12181; GFX12-NEXT: s_wait_alu 0xfffe 12182; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12183; GFX12-NEXT: s_wait_alu 0xfffe 12184; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12185; GFX12-NEXT: s_cbranch_execnz .LBB43_1 12186; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12187; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12188; GFX12-NEXT: v_mov_b32_e32 v0, v3 12189; GFX12-NEXT: s_wait_alu 0xfffe 12190; GFX12-NEXT: s_setpc_b64 s[30:31] 12191; 12192; GFX940-LABEL: 
flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12193; GFX940: ; %bb.0: 12194; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12195; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 12196; GFX940-NEXT: s_mov_b64 s[0:1], 0 12197; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start 12198; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12199; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12200; GFX940-NEXT: v_mov_b32_e32 v5, v3 12201; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12202; GFX940-NEXT: buffer_wbl2 sc1 12203; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 12204; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12205; GFX940-NEXT: buffer_inv sc1 12206; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12207; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12208; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12209; GFX940-NEXT: s_cbranch_execnz .LBB43_1 12210; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12211; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12212; GFX940-NEXT: v_mov_b32_e32 v0, v3 12213; GFX940-NEXT: s_setpc_b64 s[30:31] 12214; 12215; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12216; GFX11: ; %bb.0: 12217; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12218; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 12219; GFX11-NEXT: s_mov_b32 s0, 0 12220; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start 12221; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12222; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12223; GFX11-NEXT: v_mov_b32_e32 v4, v3 12224; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12225; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12226; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12227; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 12228; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12229; GFX11-NEXT: buffer_gl1_inv 12230; GFX11-NEXT: buffer_gl0_inv 12231; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12232; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12233; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) 12234; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12235; GFX11-NEXT: s_cbranch_execnz .LBB43_1 12236; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12237; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12238; GFX11-NEXT: v_mov_b32_e32 v0, v3 12239; GFX11-NEXT: s_setpc_b64 s[30:31] 12240; 12241; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12242; GFX10: ; %bb.0: 12243; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12244; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 12245; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 12246; GFX10-NEXT: s_mov_b32 s4, 0 12247; GFX10-NEXT: flat_load_dword v0, v[3:4] 12248; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start 12249; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12250; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12251; GFX10-NEXT: v_mov_b32_e32 v1, v0 12252; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 12253; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12254; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 12255; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12256; GFX10-NEXT: buffer_gl1_inv 12257; GFX10-NEXT: buffer_gl0_inv 12258; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 12259; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12260; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12261; GFX10-NEXT: s_cbranch_execnz .LBB43_1 12262; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12263; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12264; GFX10-NEXT: s_setpc_b64 s[30:31] 12265; 12266; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12267; GFX90A: ; %bb.0: 12268; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12269; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 12270; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12271; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start 12272; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12273; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12274; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12275; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12276; 
GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 12277; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12278; GFX90A-NEXT: buffer_wbinvl1 12279; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12280; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12281; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12282; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 12283; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12284; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12285; GFX90A-NEXT: v_mov_b32_e32 v0, v3 12286; GFX90A-NEXT: s_setpc_b64 s[30:31] 12287; 12288; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12289; GFX908: ; %bb.0: 12290; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12291; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 12292; GFX908-NEXT: s_mov_b64 s[4:5], 0 12293; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start 12294; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12295; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12296; GFX908-NEXT: v_mov_b32_e32 v4, v3 12297; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12298; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 12299; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12300; GFX908-NEXT: buffer_wbinvl1 12301; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12302; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12303; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12304; GFX908-NEXT: s_cbranch_execnz .LBB43_1 12305; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12306; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12307; GFX908-NEXT: v_mov_b32_e32 v0, v3 12308; GFX908-NEXT: s_setpc_b64 s[30:31] 12309; 12310; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12311; GFX8: ; %bb.0: 12312; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12313; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 12314; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 12315; GFX8-NEXT: flat_load_dword v0, v[3:4] 12316; GFX8-NEXT: s_mov_b64 s[4:5], 0 12317; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start 12318; GFX8-NEXT: ; =>This 
Inner Loop Header: Depth=1 12319; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12320; GFX8-NEXT: v_mov_b32_e32 v1, v0 12321; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 12322; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12323; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 12324; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12325; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 12326; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 12327; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12328; GFX8-NEXT: buffer_wbinvl1 12329; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 12330; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12331; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12332; GFX8-NEXT: s_cbranch_execnz .LBB43_1 12333; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12334; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12335; GFX8-NEXT: s_setpc_b64 s[30:31] 12336; 12337; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: 12338; GFX7: ; %bb.0: 12339; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12340; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 12341; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 12342; GFX7-NEXT: flat_load_dword v1, v[4:5] 12343; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 12344; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 12345; GFX7-NEXT: s_mov_b64 s[4:5], 0 12346; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 12347; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12348; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12349; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 12350; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 12351; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12352; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start 12353; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12354; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 12355; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 12356; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 12357; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 12358; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12359; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 12360; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 12361; GFX7-NEXT: v_cvt_f16_f32_e32 
v6, v6 12362; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 12363; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 12364; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 12365; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 12366; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 12367; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12368; GFX7-NEXT: buffer_wbinvl1 12369; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 12370; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 12371; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12372; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 12373; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12374; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12375; GFX7-NEXT: s_cbranch_execnz .LBB43_1 12376; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12377; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12378; GFX7-NEXT: s_setpc_b64 s[30:31] 12379 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 12380 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst 12381 ret <2 x half> %result 12382} 12383 12384define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 x half> %val) #0 { 12385; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12386; GFX12: ; %bb.0: 12387; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12388; GFX12-NEXT: s_wait_expcnt 0x0 12389; GFX12-NEXT: s_wait_samplecnt 0x0 12390; GFX12-NEXT: s_wait_bvhcnt 0x0 12391; GFX12-NEXT: s_wait_kmcnt 0x0 12392; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 12393; GFX12-NEXT: s_mov_b32 s0, 0 12394; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start 12395; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12396; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12397; GFX12-NEXT: v_mov_b32_e32 v4, v3 12398; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 12399; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12400; GFX12-NEXT: s_wait_storecnt 0x0 12401; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12402; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12403; GFX12-NEXT: global_inv scope:SCOPE_DEV 12404; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12405; GFX12-NEXT: s_wait_alu 0xfffe 12406; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12407; GFX12-NEXT: s_wait_alu 0xfffe 12408; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12409; GFX12-NEXT: s_cbranch_execnz .LBB44_1 12410; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12411; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12412; GFX12-NEXT: v_mov_b32_e32 v0, v3 12413; GFX12-NEXT: s_wait_alu 0xfffe 12414; GFX12-NEXT: s_setpc_b64 s[30:31] 12415; 12416; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12417; GFX940: ; %bb.0: 12418; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12419; GFX940-NEXT: v_mov_b32_e32 v4, v0 12420; GFX940-NEXT: v_mov_b32_e32 v5, v1 12421; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 12422; GFX940-NEXT: s_movk_i32 s0, 0xf800 12423; GFX940-NEXT: s_nop 0 12424; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 12425; GFX940-NEXT: flat_load_dword v0, v[0:1] 12426; GFX940-NEXT: s_mov_b32 s1, -1 12427; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 12428; GFX940-NEXT: s_mov_b64 s[0:1], 0 12429; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start 12430; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12431; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12432; GFX940-NEXT: v_mov_b32_e32 v1, v0 12433; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 12434; GFX940-NEXT: buffer_wbl2 sc1 12435; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 12436; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12437; GFX940-NEXT: buffer_inv sc1 12438; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 12439; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12440; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12441; GFX940-NEXT: s_cbranch_execnz .LBB44_1 12442; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12443; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12444; GFX940-NEXT: s_setpc_b64 s[30:31] 12445; 12446; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12447; GFX11: ; %bb.0: 12448; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 12449; GFX11-NEXT: v_mov_b32_e32 v3, v0 12450; GFX11-NEXT: s_mov_b32 s0, 0 12451; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12452; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 12453; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 12454; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 12455; GFX11-NEXT: flat_load_b32 v0, v[4:5] 12456; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 12457; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start 12458; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12459; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12460; GFX11-NEXT: v_mov_b32_e32 v1, v0 12461; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12462; GFX11-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 12463; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12464; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc 12465; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12466; GFX11-NEXT: buffer_gl1_inv 12467; GFX11-NEXT: buffer_gl0_inv 12468; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 12469; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12470; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12471; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12472; GFX11-NEXT: s_cbranch_execnz .LBB44_1 12473; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12474; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12475; GFX11-NEXT: s_setpc_b64 s[30:31] 12476; 12477; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12478; GFX10: ; %bb.0: 12479; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12480; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 12481; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 12482; GFX10-NEXT: s_mov_b32 s4, 0 12483; GFX10-NEXT: flat_load_dword v0, v[3:4] 12484; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start 12485; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12486; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12487; GFX10-NEXT: v_mov_b32_e32 v1, v0 12488; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 
12489; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12490; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 12491; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12492; GFX10-NEXT: buffer_gl1_inv 12493; GFX10-NEXT: buffer_gl0_inv 12494; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 12495; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12496; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12497; GFX10-NEXT: s_cbranch_execnz .LBB44_1 12498; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12499; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12500; GFX10-NEXT: s_setpc_b64 s[30:31] 12501; 12502; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12503; GFX90A: ; %bb.0: 12504; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12505; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 12506; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 12507; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 12508; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 12509; GFX90A-NEXT: flat_load_dword v0, v[0:1] 12510; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12511; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start 12512; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12513; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12514; GFX90A-NEXT: v_mov_b32_e32 v1, v0 12515; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 12516; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 12517; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12518; GFX90A-NEXT: buffer_wbinvl1 12519; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 12520; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12521; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12522; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 12523; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12524; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12525; GFX90A-NEXT: s_setpc_b64 s[30:31] 12526; 12527; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12528; GFX908: ; %bb.0: 12529; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12530; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, 
v0 12531; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 12532; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 12533; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 12534; GFX908-NEXT: flat_load_dword v0, v[0:1] 12535; GFX908-NEXT: s_mov_b64 s[4:5], 0 12536; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start 12537; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12538; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12539; GFX908-NEXT: v_mov_b32_e32 v1, v0 12540; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 12541; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 12542; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12543; GFX908-NEXT: buffer_wbinvl1 12544; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 12545; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12546; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12547; GFX908-NEXT: s_cbranch_execnz .LBB44_1 12548; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12549; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12550; GFX908-NEXT: s_setpc_b64 s[30:31] 12551; 12552; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12553; GFX8: ; %bb.0: 12554; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12555; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 12556; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 12557; GFX8-NEXT: flat_load_dword v0, v[3:4] 12558; GFX8-NEXT: s_mov_b64 s[4:5], 0 12559; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start 12560; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12561; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12562; GFX8-NEXT: v_mov_b32_e32 v1, v0 12563; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 12564; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12565; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 12566; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12567; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 12568; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 12569; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12570; GFX8-NEXT: buffer_wbinvl1 12571; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 12572; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12573; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12574; GFX8-NEXT: s_cbranch_execnz .LBB44_1 12575; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12576; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12577; GFX8-NEXT: s_setpc_b64 s[30:31] 12578; 12579; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: 12580; GFX7: ; %bb.0: 12581; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12582; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 12583; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 12584; GFX7-NEXT: flat_load_dword v1, v[4:5] 12585; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 12586; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 12587; GFX7-NEXT: s_mov_b64 s[4:5], 0 12588; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 12589; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12590; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12591; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 12592; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 12593; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12594; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start 12595; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12596; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 12597; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 12598; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 12599; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 12600; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12601; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 12602; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 12603; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 12604; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 12605; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 12606; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 12607; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 12608; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 12609; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12610; GFX7-NEXT: buffer_wbinvl1 12611; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 12612; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 12613; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12614; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 12615; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12616; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12617; GFX7-NEXT: s_cbranch_execnz .LBB44_1 12618; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12619; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12620; GFX7-NEXT: s_setpc_b64 s[30:31] 12621 %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 12622 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst 12623 ret <2 x half> %result 12624} 12625 12626define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { 12627; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12628; GFX12: ; %bb.0: 12629; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12630; GFX12-NEXT: s_wait_expcnt 0x0 12631; GFX12-NEXT: s_wait_samplecnt 0x0 12632; GFX12-NEXT: s_wait_bvhcnt 0x0 12633; GFX12-NEXT: s_wait_kmcnt 0x0 12634; GFX12-NEXT: flat_load_b32 v4, v[0:1] 12635; GFX12-NEXT: s_mov_b32 s0, 0 12636; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start 12637; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12638; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12639; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12640; GFX12-NEXT: s_wait_storecnt 0x0 12641; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12642; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12643; GFX12-NEXT: global_inv scope:SCOPE_DEV 12644; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12645; GFX12-NEXT: v_mov_b32_e32 v4, v3 12646; GFX12-NEXT: s_wait_alu 0xfffe 12647; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12648; GFX12-NEXT: s_wait_alu 0xfffe 12649; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12650; GFX12-NEXT: s_cbranch_execnz .LBB45_1 12651; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12652; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12653; GFX12-NEXT: s_wait_alu 0xfffe 12654; GFX12-NEXT: s_setpc_b64 s[30:31] 12655; 12656; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12657; GFX940: ; %bb.0: 12658; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12659; GFX940-NEXT: flat_load_dword v5, v[0:1] 12660; GFX940-NEXT: s_mov_b64 s[0:1], 0 12661; 
GFX940-NEXT: .LBB45_1: ; %atomicrmw.start 12662; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12663; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12664; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12665; GFX940-NEXT: buffer_wbl2 sc1 12666; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 12667; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12668; GFX940-NEXT: buffer_inv sc1 12669; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12670; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12671; GFX940-NEXT: v_mov_b32_e32 v5, v3 12672; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12673; GFX940-NEXT: s_cbranch_execnz .LBB45_1 12674; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12675; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12676; GFX940-NEXT: s_setpc_b64 s[30:31] 12677; 12678; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12679; GFX11: ; %bb.0: 12680; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12681; GFX11-NEXT: flat_load_b32 v4, v[0:1] 12682; GFX11-NEXT: s_mov_b32 s0, 0 12683; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start 12684; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12685; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12686; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12687; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12688; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 12689; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12690; GFX11-NEXT: buffer_gl1_inv 12691; GFX11-NEXT: buffer_gl0_inv 12692; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12693; GFX11-NEXT: v_mov_b32_e32 v4, v3 12694; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12695; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12696; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12697; GFX11-NEXT: s_cbranch_execnz .LBB45_1 12698; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12699; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12700; GFX11-NEXT: s_setpc_b64 s[30:31] 12701; 12702; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12703; GFX10: ; %bb.0: 12704; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 12705; GFX10-NEXT: flat_load_dword v4, v[0:1] 12706; GFX10-NEXT: s_mov_b32 s4, 0 12707; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start 12708; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12709; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12710; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12711; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12712; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12713; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12714; GFX10-NEXT: buffer_gl1_inv 12715; GFX10-NEXT: buffer_gl0_inv 12716; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12717; GFX10-NEXT: v_mov_b32_e32 v4, v3 12718; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12719; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12720; GFX10-NEXT: s_cbranch_execnz .LBB45_1 12721; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12722; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12723; GFX10-NEXT: s_setpc_b64 s[30:31] 12724; 12725; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12726; GFX90A: ; %bb.0: 12727; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12728; GFX90A-NEXT: flat_load_dword v5, v[0:1] 12729; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12730; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start 12731; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12732; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12733; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12734; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 12735; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12736; GFX90A-NEXT: buffer_wbinvl1 12737; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12738; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12739; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12740; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12741; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 12742; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12743; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12744; GFX90A-NEXT: s_setpc_b64 s[30:31] 12745; 12746; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12747; GFX908: ; %bb.0: 12748; GFX908-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12749; GFX908-NEXT: flat_load_dword v4, v[0:1] 12750; GFX908-NEXT: s_mov_b64 s[4:5], 0 12751; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start 12752; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12753; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12754; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12755; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12756; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12757; GFX908-NEXT: buffer_wbinvl1 12758; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12759; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12760; GFX908-NEXT: v_mov_b32_e32 v4, v3 12761; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12762; GFX908-NEXT: s_cbranch_execnz .LBB45_1 12763; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12764; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12765; GFX908-NEXT: s_setpc_b64 s[30:31] 12766; 12767; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12768; GFX8: ; %bb.0: 12769; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12770; GFX8-NEXT: flat_load_dword v4, v[0:1] 12771; GFX8-NEXT: s_mov_b64 s[4:5], 0 12772; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start 12773; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12774; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12775; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 12776; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12777; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 12778; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 12779; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 12780; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12781; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12782; GFX8-NEXT: buffer_wbinvl1 12783; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12784; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12785; GFX8-NEXT: v_mov_b32_e32 v4, v3 12786; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12787; GFX8-NEXT: s_cbranch_execnz .LBB45_1 12788; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12789; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12790; 
GFX8-NEXT: s_setpc_b64 s[30:31] 12791; 12792; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16: 12793; GFX7: ; %bb.0: 12794; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12795; GFX7-NEXT: flat_load_dword v5, v[0:1] 12796; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 12797; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 12798; GFX7-NEXT: s_mov_b64 s[4:5], 0 12799; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 12800; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12801; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 12802; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 12803; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 12804; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 12805; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start 12806; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12807; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 12808; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 12809; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 12810; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 12811; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 12812; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 12813; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 12814; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 12815; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 12816; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 12817; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 12818; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 12819; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 12820; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12821; GFX7-NEXT: buffer_wbinvl1 12822; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 12823; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 12824; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 12825; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 12826; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12827; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12828; GFX7-NEXT: s_cbranch_execnz .LBB45_1 12829; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12830; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12831; GFX7-NEXT: s_setpc_b64 s[30:31] 12832 %unused = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst 12833 ret void 12834} 12835 12836define void 
@flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 { 12837; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12838; GFX12: ; %bb.0: 12839; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12840; GFX12-NEXT: s_wait_expcnt 0x0 12841; GFX12-NEXT: s_wait_samplecnt 0x0 12842; GFX12-NEXT: s_wait_bvhcnt 0x0 12843; GFX12-NEXT: s_wait_kmcnt 0x0 12844; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 12845; GFX12-NEXT: s_mov_b32 s0, 0 12846; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start 12847; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12848; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12849; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12850; GFX12-NEXT: s_wait_storecnt 0x0 12851; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12852; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12853; GFX12-NEXT: global_inv scope:SCOPE_DEV 12854; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12855; GFX12-NEXT: v_mov_b32_e32 v4, v3 12856; GFX12-NEXT: s_wait_alu 0xfffe 12857; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12858; GFX12-NEXT: s_wait_alu 0xfffe 12859; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12860; GFX12-NEXT: s_cbranch_execnz .LBB46_1 12861; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12862; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12863; GFX12-NEXT: s_wait_alu 0xfffe 12864; GFX12-NEXT: s_setpc_b64 s[30:31] 12865; 12866; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12867; GFX940: ; %bb.0: 12868; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12869; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 12870; GFX940-NEXT: s_mov_b64 s[0:1], 0 12871; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start 12872; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12873; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12874; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12875; GFX940-NEXT: buffer_wbl2 sc1 12876; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 
12877; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12878; GFX940-NEXT: buffer_inv sc1 12879; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12880; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12881; GFX940-NEXT: v_mov_b32_e32 v5, v3 12882; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12883; GFX940-NEXT: s_cbranch_execnz .LBB46_1 12884; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12885; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12886; GFX940-NEXT: s_setpc_b64 s[30:31] 12887; 12888; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12889; GFX11: ; %bb.0: 12890; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12891; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 12892; GFX11-NEXT: s_mov_b32 s0, 0 12893; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start 12894; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12895; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12896; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12897; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12898; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 12899; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12900; GFX11-NEXT: buffer_gl1_inv 12901; GFX11-NEXT: buffer_gl0_inv 12902; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12903; GFX11-NEXT: v_mov_b32_e32 v4, v3 12904; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12905; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12906; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12907; GFX11-NEXT: s_cbranch_execnz .LBB46_1 12908; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12909; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12910; GFX11-NEXT: s_setpc_b64 s[30:31] 12911; 12912; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12913; GFX10: ; %bb.0: 12914; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12915; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 12916; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 12917; GFX10-NEXT: s_mov_b32 s4, 0 12918; GFX10-NEXT: flat_load_dword v4, v[0:1] 12919; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start 
12920; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12921; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12922; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12923; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12924; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12925; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12926; GFX10-NEXT: buffer_gl1_inv 12927; GFX10-NEXT: buffer_gl0_inv 12928; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12929; GFX10-NEXT: v_mov_b32_e32 v4, v3 12930; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12931; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12932; GFX10-NEXT: s_cbranch_execnz .LBB46_1 12933; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12934; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12935; GFX10-NEXT: s_setpc_b64 s[30:31] 12936; 12937; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12938; GFX90A: ; %bb.0: 12939; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12940; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 12941; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12942; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start 12943; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12944; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12945; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 12946; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 12947; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12948; GFX90A-NEXT: buffer_wbinvl1 12949; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12950; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12951; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12952; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12953; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 12954; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12955; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12956; GFX90A-NEXT: s_setpc_b64 s[30:31] 12957; 12958; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12959; GFX908: ; %bb.0: 12960; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12961; GFX908-NEXT: flat_load_dword v4, v[0:1] 
offset:2044 12962; GFX908-NEXT: s_mov_b64 s[4:5], 0 12963; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start 12964; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12965; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12966; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 12967; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 12968; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12969; GFX908-NEXT: buffer_wbinvl1 12970; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12971; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12972; GFX908-NEXT: v_mov_b32_e32 v4, v3 12973; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12974; GFX908-NEXT: s_cbranch_execnz .LBB46_1 12975; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12976; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12977; GFX908-NEXT: s_setpc_b64 s[30:31] 12978; 12979; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 12980; GFX8: ; %bb.0: 12981; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12982; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 12983; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 12984; GFX8-NEXT: flat_load_dword v4, v[0:1] 12985; GFX8-NEXT: s_mov_b64 s[4:5], 0 12986; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start 12987; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12988; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12989; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 12990; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12991; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 12992; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 12993; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 12994; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12995; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12996; GFX8-NEXT: buffer_wbinvl1 12997; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12998; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12999; GFX8-NEXT: v_mov_b32_e32 v4, v3 13000; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13001; GFX8-NEXT: s_cbranch_execnz .LBB46_1 13002; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end 13003; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13004; GFX8-NEXT: s_setpc_b64 s[30:31] 13005; 13006; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: 13007; GFX7: ; %bb.0: 13008; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13009; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 13010; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13011; GFX7-NEXT: flat_load_dword v5, v[0:1] 13012; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 13013; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 13014; GFX7-NEXT: s_mov_b64 s[4:5], 0 13015; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 13016; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13017; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 13018; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 13019; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 13020; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 13021; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start 13022; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13023; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 13024; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 13025; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 13026; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 13027; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13028; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 13029; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 13030; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 13031; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 13032; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 13033; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 13034; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 13035; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 13036; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13037; GFX7-NEXT: buffer_wbinvl1 13038; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 13039; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 13040; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 13041; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 13042; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13043; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13044; GFX7-NEXT: s_cbranch_execnz .LBB46_1 13045; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13046; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13047; GFX7-NEXT: 
s_setpc_b64 s[30:31] 13048 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 13049 %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst 13050 ret void 13051} 13052 13053define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x half> %val) #0 { 13054; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13055; GFX12: ; %bb.0: 13056; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13057; GFX12-NEXT: s_wait_expcnt 0x0 13058; GFX12-NEXT: s_wait_samplecnt 0x0 13059; GFX12-NEXT: s_wait_bvhcnt 0x0 13060; GFX12-NEXT: s_wait_kmcnt 0x0 13061; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 13062; GFX12-NEXT: s_mov_b32 s0, 0 13063; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start 13064; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13065; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13066; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 13067; GFX12-NEXT: s_wait_storecnt 0x0 13068; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 13069; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13070; GFX12-NEXT: global_inv scope:SCOPE_DEV 13071; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13072; GFX12-NEXT: v_mov_b32_e32 v4, v3 13073; GFX12-NEXT: s_wait_alu 0xfffe 13074; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13075; GFX12-NEXT: s_wait_alu 0xfffe 13076; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13077; GFX12-NEXT: s_cbranch_execnz .LBB47_1 13078; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13079; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13080; GFX12-NEXT: s_wait_alu 0xfffe 13081; GFX12-NEXT: s_setpc_b64 s[30:31] 13082; 13083; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13084; GFX940: ; %bb.0: 13085; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13086; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 13087; GFX940-NEXT: s_movk_i32 s0, 0xf800 13088; GFX940-NEXT: s_nop 0 13089; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 13090; GFX940-NEXT: flat_load_dword v5, 
v[4:5] 13091; GFX940-NEXT: s_mov_b32 s1, -1 13092; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 13093; GFX940-NEXT: s_mov_b64 s[0:1], 0 13094; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start 13095; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13096; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13097; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] 13098; GFX940-NEXT: buffer_wbl2 sc1 13099; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 13100; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13101; GFX940-NEXT: buffer_inv sc1 13102; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 13103; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13104; GFX940-NEXT: v_mov_b32_e32 v5, v3 13105; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13106; GFX940-NEXT: s_cbranch_execnz .LBB47_1 13107; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13108; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13109; GFX940-NEXT: s_setpc_b64 s[30:31] 13110; 13111; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13112; GFX11: ; %bb.0: 13113; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13114; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 13115; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 13116; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 13117; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 13118; GFX11-NEXT: flat_load_b32 v4, v[3:4] 13119; GFX11-NEXT: s_mov_b32 s0, 0 13120; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start 13121; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13122; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13123; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 13124; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13125; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 13126; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13127; GFX11-NEXT: buffer_gl1_inv 13128; GFX11-NEXT: buffer_gl0_inv 13129; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13130; GFX11-NEXT: v_mov_b32_e32 v4, v3 13131; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 
13132; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13133; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13134; GFX11-NEXT: s_cbranch_execnz .LBB47_1 13135; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13136; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13137; GFX11-NEXT: s_setpc_b64 s[30:31] 13138; 13139; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13140; GFX10: ; %bb.0: 13141; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13142; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 13143; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 13144; GFX10-NEXT: s_mov_b32 s4, 0 13145; GFX10-NEXT: flat_load_dword v4, v[0:1] 13146; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start 13147; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13148; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13149; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] 13150; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13151; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 13152; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13153; GFX10-NEXT: buffer_gl1_inv 13154; GFX10-NEXT: buffer_gl0_inv 13155; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13156; GFX10-NEXT: v_mov_b32_e32 v4, v3 13157; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13158; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13159; GFX10-NEXT: s_cbranch_execnz .LBB47_1 13160; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13161; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13162; GFX10-NEXT: s_setpc_b64 s[30:31] 13163; 13164; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13165; GFX90A: ; %bb.0: 13166; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13167; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 13168; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 13169; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 13170; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 13171; GFX90A-NEXT: flat_load_dword v1, v[0:1] 13172; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13173; GFX90A-NEXT: .LBB47_1: ; 
%atomicrmw.start 13174; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13175; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13176; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 13177; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 13178; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13179; GFX90A-NEXT: buffer_wbinvl1 13180; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 13181; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13182; GFX90A-NEXT: v_mov_b32_e32 v1, v0 13183; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13184; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 13185; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13186; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13187; GFX90A-NEXT: s_setpc_b64 s[30:31] 13188; 13189; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13190; GFX908: ; %bb.0: 13191; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13192; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 13193; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 13194; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 13195; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 13196; GFX908-NEXT: flat_load_dword v1, v[0:1] 13197; GFX908-NEXT: s_mov_b64 s[4:5], 0 13198; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start 13199; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13200; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13201; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] 13202; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 13203; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13204; GFX908-NEXT: buffer_wbinvl1 13205; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 13206; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13207; GFX908-NEXT: v_mov_b32_e32 v1, v0 13208; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13209; GFX908-NEXT: s_cbranch_execnz .LBB47_1 13210; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13211; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13212; GFX908-NEXT: s_setpc_b64 s[30:31] 13213; 13214; GFX8-LABEL: 
flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13215; GFX8: ; %bb.0: 13216; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13217; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 13218; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 13219; GFX8-NEXT: flat_load_dword v4, v[0:1] 13220; GFX8-NEXT: s_mov_b64 s[4:5], 0 13221; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start 13222; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13223; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13224; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 13225; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 13226; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 13227; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13228; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 13229; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 13230; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13231; GFX8-NEXT: buffer_wbinvl1 13232; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 13233; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13234; GFX8-NEXT: v_mov_b32_e32 v4, v3 13235; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13236; GFX8-NEXT: s_cbranch_execnz .LBB47_1 13237; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13238; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13239; GFX8-NEXT: s_setpc_b64 s[30:31] 13240; 13241; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: 13242; GFX7: ; %bb.0: 13243; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13244; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 13245; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 13246; GFX7-NEXT: flat_load_dword v5, v[0:1] 13247; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 13248; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 13249; GFX7-NEXT: s_mov_b64 s[4:5], 0 13250; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 13251; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13252; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 13253; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 13254; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 13255; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 13256; GFX7-NEXT: .LBB47_1: ; 
%atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
  %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
  ret void
}

; Value-returning atomicrmw fsub on <2 x half> through a flat pointer with a
; positive constant offset. The atomicrmw carries no syncscope and is seq_cst;
; the GFX12 checks show SCOPE_SYS write-back/invalidate around it. Every
; checked target expands the operation into a flat_atomic_cmpswap retry loop
; (the %atomicrmw.start block).
; NOTE(review): the "offset12b" in the name suggests the offset was picked to
; fit a 12-bit immediate field - confirm against the other tests in this file.
; The check lines in the body are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB48_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB48_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB48_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB48_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[4:5]
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  ; 511 elements of 4-byte <2 x half> = byte offset 2044 (0x7fc). The checks
  ; above show it folded into the instruction offset on GFX12, GFX940, GFX11,
  ; GFX90A and GFX908, and applied with explicit 64-bit address adds on GFX10,
  ; GFX8 and GFX7.
  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
  %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst
  ret <2 x half> %result
}

define void
@flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
; GFX12-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB49_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB49_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB49_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB49_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB49_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  ; No-return form: the value produced by the atomicrmw is bound to %unused and
  ; the function returns void, so the checks above end without a trailing
  ; v_mov into v0. The atomicrmw carries no syncscope and is seq_cst; the
  ; GFX12 checks show SCOPE_SYS for it.
  ; The GEP advances 511 four-byte <2 x half> elements, i.e. 2044 bytes
  ; (0x7fc), which is the immediate offset / address-add constant seen in the
  ; checks. The check lines are autogenerated; regenerate them with
  ; utils/update_llc_test_checks.py instead of editing them by hand.
  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
  %unused = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst
  ret void
}

; --------------------------------------------------------------------
; <2 x bfloat>
; --------------------------------------------------------------------

define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
;
GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 13769; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 13770; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 13771; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 13772; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13773; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 13774; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 13775; GFX12-NEXT: s_wait_storecnt 0x0 13776; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 13777; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13778; GFX12-NEXT: global_inv scope:SCOPE_DEV 13779; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 13780; GFX12-NEXT: s_wait_alu 0xfffe 13781; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 13782; GFX12-NEXT: s_wait_alu 0xfffe 13783; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 13784; GFX12-NEXT: s_cbranch_execnz .LBB50_1 13785; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13786; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 13787; GFX12-NEXT: v_mov_b32_e32 v0, v3 13788; GFX12-NEXT: s_wait_alu 0xfffe 13789; GFX12-NEXT: s_setpc_b64 s[30:31] 13790; 13791; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 13792; GFX940: ; %bb.0: 13793; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13794; GFX940-NEXT: flat_load_dword v3, v[0:1] 13795; GFX940-NEXT: s_mov_b64 s[2:3], 0 13796; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13797; GFX940-NEXT: s_movk_i32 s4, 0x7fff 13798; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13799; GFX940-NEXT: s_mov_b32 s5, 0x7060302 13800; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start 13801; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13802; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13803; GFX940-NEXT: v_mov_b32_e32 v7, v3 13804; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 13805; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 13806; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 13807; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 13808; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 13809; 
GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 13810; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 13811; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 13812; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 13813; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 13814; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13815; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 13816; GFX940-NEXT: s_nop 0 13817; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 13818; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] 13819; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 13820; GFX940-NEXT: buffer_wbl2 sc1 13821; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 13822; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13823; GFX940-NEXT: buffer_inv sc1 13824; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 13825; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 13826; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 13827; GFX940-NEXT: s_cbranch_execnz .LBB50_1 13828; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13829; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 13830; GFX940-NEXT: v_mov_b32_e32 v0, v3 13831; GFX940-NEXT: s_setpc_b64 s[30:31] 13832; 13833; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 13834; GFX11: ; %bb.0: 13835; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13836; GFX11-NEXT: flat_load_b32 v3, v[0:1] 13837; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13838; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13839; GFX11-NEXT: s_mov_b32 s1, 0 13840; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 13841; GFX11-NEXT: .p2align 6 13842; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start 13843; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13844; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13845; GFX11-NEXT: v_mov_b32_e32 v6, v3 13846; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13847; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 13848; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 13849; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 13850; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 13851; 
GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 13852; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 13853; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 13854; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 13855; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 13856; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 13857; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 13858; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 13859; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 13860; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 13861; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 13862; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 13863; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13864; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 13865; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 13866; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13867; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc 13868; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13869; GFX11-NEXT: buffer_gl1_inv 13870; GFX11-NEXT: buffer_gl0_inv 13871; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 13872; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 13873; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13874; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 13875; GFX11-NEXT: s_cbranch_execnz .LBB50_1 13876; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13877; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 13878; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 13879; GFX11-NEXT: v_mov_b32_e32 v0, v3 13880; GFX11-NEXT: s_setpc_b64 s[30:31] 13881; 13882; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 13883; GFX10: ; %bb.0: 13884; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13885; GFX10-NEXT: flat_load_dword v3, v[0:1] 13886; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13887; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13888; GFX10-NEXT: s_mov_b32 s5, 0 13889; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start 13890; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13891; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 13892; GFX10-NEXT: v_mov_b32_e32 v6, v3 13893; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 13894; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 13895; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 13896; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 13897; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 13898; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 13899; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 13900; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 13901; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 13902; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 13903; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 13904; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 13905; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 13906; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 13907; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 13908; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13909; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 13910; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13911; GFX10-NEXT: buffer_gl1_inv 13912; GFX10-NEXT: buffer_gl0_inv 13913; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 13914; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 13915; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 13916; GFX10-NEXT: s_cbranch_execnz .LBB50_1 13917; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13918; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 13919; GFX10-NEXT: v_mov_b32_e32 v0, v3 13920; GFX10-NEXT: s_setpc_b64 s[30:31] 13921; 13922; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 13923; GFX90A: ; %bb.0: 13924; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13925; GFX90A-NEXT: flat_load_dword v3, v[0:1] 13926; GFX90A-NEXT: s_mov_b64 s[6:7], 0 13927; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13928; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 13929; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13930; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 13931; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start 13932; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13933; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13934; GFX90A-NEXT: v_mov_b32_e32 
v7, v3 13935; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 13936; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 13937; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 13938; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 13939; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 13940; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 13941; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 13942; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 13943; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 13944; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 13945; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13946; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 13947; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 13948; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 13949; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 13950; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc 13951; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13952; GFX90A-NEXT: buffer_wbinvl1 13953; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 13954; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 13955; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 13956; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 13957; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13958; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 13959; GFX90A-NEXT: v_mov_b32_e32 v0, v3 13960; GFX90A-NEXT: s_setpc_b64 s[30:31] 13961; 13962; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 13963; GFX908: ; %bb.0: 13964; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13965; GFX908-NEXT: flat_load_dword v3, v[0:1] 13966; GFX908-NEXT: s_mov_b64 s[6:7], 0 13967; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13968; GFX908-NEXT: s_movk_i32 s8, 0x7fff 13969; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13970; GFX908-NEXT: s_mov_b32 s9, 0x7060302 13971; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start 13972; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13973; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13974; GFX908-NEXT: v_mov_b32_e32 v6, v3 13975; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 13976; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 13977; GFX908-NEXT: 
v_sub_f32_e32 v3, v3, v4 13978; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 13979; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 13980; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 13981; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 13982; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 13983; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 13984; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 13985; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 13986; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 13987; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 13988; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 13989; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 13990; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 13991; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13992; GFX908-NEXT: buffer_wbinvl1 13993; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 13994; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 13995; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 13996; GFX908-NEXT: s_cbranch_execnz .LBB50_1 13997; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13998; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 13999; GFX908-NEXT: v_mov_b32_e32 v0, v3 14000; GFX908-NEXT: s_setpc_b64 s[30:31] 14001; 14002; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 14003; GFX8: ; %bb.0: 14004; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14005; GFX8-NEXT: flat_load_dword v3, v[0:1] 14006; GFX8-NEXT: s_mov_b64 s[6:7], 0 14007; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14008; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14009; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start 14010; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14011; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14012; GFX8-NEXT: v_mov_b32_e32 v6, v3 14013; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14014; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14015; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 14016; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 14017; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 14018; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 14019; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 14020; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, 
v5 14021; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 14022; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 14023; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 14024; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14025; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 14026; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14027; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14028; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 14029; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 14030; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 14031; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 14032; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14033; GFX8-NEXT: buffer_wbinvl1 14034; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 14035; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14036; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 14037; GFX8-NEXT: s_cbranch_execnz .LBB50_1 14038; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14039; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 14040; GFX8-NEXT: v_mov_b32_e32 v0, v3 14041; GFX8-NEXT: s_setpc_b64 s[30:31] 14042; 14043; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16: 14044; GFX7: ; %bb.0: 14045; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14046; GFX7-NEXT: flat_load_dword v5, v[0:1] 14047; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 14048; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 14049; GFX7-NEXT: s_mov_b64 s[4:5], 0 14050; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 14051; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14052; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 14053; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 14054; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14055; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start 14056; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14057; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 14058; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 14059; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 14060; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 14061; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 14062; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 14063; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 14064; 
GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 14065; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 14066; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 14067; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc 14068; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14069; GFX7-NEXT: buffer_wbinvl1 14070; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 14071; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 14072; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14073; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14074; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14075; GFX7-NEXT: s_cbranch_execnz .LBB50_1 14076; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14077; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14078; GFX7-NEXT: v_mov_b32_e32 v0, v3 14079; GFX7-NEXT: v_mov_b32_e32 v1, v2 14080; GFX7-NEXT: s_setpc_b64 s[30:31] 14081 %result = atomicrmw fsub ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst 14082 ret <2 x bfloat> %result 14083} 14084
; Value-returning <2 x bfloat> atomicrmw fsub at agent scope (seq_cst) through
; a flat pointer advanced by a positive offset. The GEP in the body steps 511
; elements of 4 bytes each, i.e. 2044 (0x7fc) bytes. In the checks below,
; GFX12, GFX940, GFX11, GFX90A and GFX908 carry that as an immediate offset of
; 2044 on the flat load and the cmpswap, while GFX10, GFX8 and GFX7 add 0x7fc
; to the 64-bit pointer up front. Every target emits a flat_atomic_cmpswap
; retry loop (the atomicrmw.start / atomicrmw.end blocks).
; NOTE(review): the "12b" in the name presumably refers to the 12-bit
; immediate offset field of some targets - confirm against the backend.
; The check lines are autogenerated (see the NOTE on the first line of the
; file), so regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
14085define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { 14086; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14087; GFX12: ; %bb.0: 14088; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14089; GFX12-NEXT: s_wait_expcnt 0x0 14090; GFX12-NEXT: s_wait_samplecnt 0x0 14091; GFX12-NEXT: s_wait_bvhcnt 0x0 14092; GFX12-NEXT: s_wait_kmcnt 0x0 14093; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 14094; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14095; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14096; GFX12-NEXT: s_mov_b32 s1, 0 14097; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start 14098; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 14099; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14100; GFX12-NEXT: v_mov_b32_e32 v6, v3 14101; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14102; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14103; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 14104; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14105; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
14106; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 14107; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 14108; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 14109; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14110; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14111; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14112; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 14113; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 14114; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14115; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14116; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14117; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14118; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14119; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14120; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14121; GFX12-NEXT: s_wait_storecnt 0x0 14122; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14123; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14124; GFX12-NEXT: global_inv scope:SCOPE_DEV 14125; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14126; GFX12-NEXT: s_wait_alu 0xfffe 14127; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 14128; GFX12-NEXT: s_wait_alu 0xfffe 14129; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14130; GFX12-NEXT: s_cbranch_execnz .LBB51_1 14131; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 14132; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 14133; GFX12-NEXT: v_mov_b32_e32 v0, v3 14134; GFX12-NEXT: s_wait_alu 0xfffe 14135; GFX12-NEXT: s_setpc_b64 s[30:31] 14136; 14137; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14138; GFX940: ; %bb.0: 14139; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14140; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 14141; GFX940-NEXT: s_mov_b64 s[2:3], 0 14142; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14143; GFX940-NEXT: s_movk_i32 s4, 0x7fff 14144; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14145; GFX940-NEXT: 
s_mov_b32 s5, 0x7060302 14146; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start 14147; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 14148; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14149; GFX940-NEXT: v_mov_b32_e32 v7, v3 14150; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 14151; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 14152; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 14153; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 14154; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 14155; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 14156; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 14157; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 14158; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 14159; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 14160; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14161; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 14162; GFX940-NEXT: s_nop 0 14163; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14164; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] 14165; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 14166; GFX940-NEXT: buffer_wbl2 sc1 14167; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 14168; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14169; GFX940-NEXT: buffer_inv sc1 14170; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 14171; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 14172; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 14173; GFX940-NEXT: s_cbranch_execnz .LBB51_1 14174; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14175; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 14176; GFX940-NEXT: v_mov_b32_e32 v0, v3 14177; GFX940-NEXT: s_setpc_b64 s[30:31] 14178; 14179; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14180; GFX11: ; %bb.0: 14181; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14182; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 14183; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14184; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14185; GFX11-NEXT: s_mov_b32 s1, 0 14186; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 14187; GFX11-NEXT: .p2align 6 14188; GFX11-NEXT: 
.LBB51_1: ; %atomicrmw.start 14189; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14190; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14191; GFX11-NEXT: v_mov_b32_e32 v6, v3 14192; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14193; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14194; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 14195; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14196; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14197; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 14198; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 14199; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 14200; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14201; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14202; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14203; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 14204; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 14205; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14206; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14207; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14208; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14209; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14210; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14211; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14212; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14213; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc 14214; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14215; GFX11-NEXT: buffer_gl1_inv 14216; GFX11-NEXT: buffer_gl0_inv 14217; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14218; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 14219; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14220; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14221; GFX11-NEXT: s_cbranch_execnz .LBB51_1 14222; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14223; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 14224; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 14225; 
GFX11-NEXT: v_mov_b32_e32 v0, v3 14226; GFX11-NEXT: s_setpc_b64 s[30:31] 14227; 14228; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14229; GFX10: ; %bb.0: 14230; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14231; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 14232; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 14233; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14234; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14235; GFX10-NEXT: s_mov_b32 s5, 0 14236; GFX10-NEXT: flat_load_dword v0, v[3:4] 14237; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start 14238; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14239; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14240; GFX10-NEXT: v_mov_b32_e32 v6, v0 14241; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14242; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14243; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 14244; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 14245; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 14246; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 14247; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 14248; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 14249; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14250; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 14251; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14252; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 14253; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14254; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 14255; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 14256; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14257; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14258; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14259; GFX10-NEXT: buffer_gl1_inv 14260; GFX10-NEXT: buffer_gl0_inv 14261; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 14262; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 14263; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 14264; GFX10-NEXT: s_cbranch_execnz .LBB51_1 14265; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14266; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 14267; GFX10-NEXT: s_setpc_b64 
s[30:31] 14268; 14269; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14270; GFX90A: ; %bb.0: 14271; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14272; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 14273; GFX90A-NEXT: s_mov_b64 s[6:7], 0 14274; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14275; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 14276; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14277; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 14278; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start 14279; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14280; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14281; GFX90A-NEXT: v_mov_b32_e32 v7, v3 14282; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 14283; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 14284; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 14285; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 14286; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 14287; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 14288; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 14289; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 14290; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 14291; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 14292; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14293; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14294; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 14295; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14296; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 14297; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc 14298; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14299; GFX90A-NEXT: buffer_wbinvl1 14300; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 14301; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14302; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 14303; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 14304; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14305; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 14306; GFX90A-NEXT: v_mov_b32_e32 v0, v3 14307; GFX90A-NEXT: s_setpc_b64 s[30:31] 14308; 14309; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14310; 
GFX908: ; %bb.0: 14311; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14312; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 14313; GFX908-NEXT: s_mov_b64 s[6:7], 0 14314; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14315; GFX908-NEXT: s_movk_i32 s8, 0x7fff 14316; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14317; GFX908-NEXT: s_mov_b32 s9, 0x7060302 14318; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start 14319; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14320; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14321; GFX908-NEXT: v_mov_b32_e32 v6, v3 14322; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14323; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14324; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 14325; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 14326; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 14327; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 14328; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 14329; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 14330; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 14331; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 14332; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14333; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14334; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 14335; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14336; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 14337; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc 14338; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14339; GFX908-NEXT: buffer_wbinvl1 14340; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 14341; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14342; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 14343; GFX908-NEXT: s_cbranch_execnz .LBB51_1 14344; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14345; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 14346; GFX908-NEXT: v_mov_b32_e32 v0, v3 14347; GFX908-NEXT: s_setpc_b64 s[30:31] 14348; 14349; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14350; GFX8: ; %bb.0: 14351; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14352; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fc, v0 14353; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 14354; GFX8-NEXT: flat_load_dword v0, v[3:4] 14355; GFX8-NEXT: s_mov_b64 s[6:7], 0 14356; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14357; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14358; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start 14359; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14360; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14361; GFX8-NEXT: v_mov_b32_e32 v6, v0 14362; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14363; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14364; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 14365; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 14366; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 14367; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 14368; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 14369; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 14370; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 14371; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 14372; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 14373; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14374; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 14375; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 14376; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14377; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 14378; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 14379; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 14380; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14381; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14382; GFX8-NEXT: buffer_wbinvl1 14383; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 14384; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14385; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 14386; GFX8-NEXT: s_cbranch_execnz .LBB51_1 14387; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14388; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 14389; GFX8-NEXT: s_setpc_b64 s[30:31] 14390; 14391; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: 14392; GFX7: ; %bb.0: 14393; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14394; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 14395; 
GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 14396; GFX7-NEXT: flat_load_dword v0, v[4:5] 14397; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 14398; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 14399; GFX7-NEXT: s_mov_b64 s[4:5], 0 14400; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 14401; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14402; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14403; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 14404; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14405; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start 14406; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14407; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 14408; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 14409; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 14410; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 14411; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14412; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 14413; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 14414; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 14415; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 14416; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 14417; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 14418; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14419; GFX7-NEXT: buffer_wbinvl1 14420; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14421; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 14422; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14423; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14424; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14425; GFX7-NEXT: s_cbranch_execnz .LBB51_1 14426; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14427; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14428; GFX7-NEXT: s_setpc_b64 s[30:31] 14429 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 14430 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst 14431 ret <2 x bfloat> %result 14432} 14433 14434define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 { 14435; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14436; GFX12: ; %bb.0: 14437; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 14438; GFX12-NEXT: s_wait_expcnt 0x0 14439; GFX12-NEXT: s_wait_samplecnt 0x0 14440; GFX12-NEXT: s_wait_bvhcnt 0x0 14441; GFX12-NEXT: s_wait_kmcnt 0x0 14442; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 14443; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14444; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14445; GFX12-NEXT: s_mov_b32 s1, 0 14446; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start 14447; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 14448; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14449; GFX12-NEXT: v_mov_b32_e32 v6, v3 14450; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14451; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14452; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 14453; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14454; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14455; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 14456; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 14457; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 14458; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14459; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14460; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14461; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 14462; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 14463; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14464; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14465; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14466; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14467; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14468; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14469; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14470; GFX12-NEXT: s_wait_storecnt 0x0 14471; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14472; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14473; GFX12-NEXT: global_inv scope:SCOPE_DEV 14474; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 14475; GFX12-NEXT: s_wait_alu 0xfffe 14476; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 14477; GFX12-NEXT: s_wait_alu 0xfffe 14478; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14479; GFX12-NEXT: s_cbranch_execnz .LBB52_1 14480; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 14481; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 14482; GFX12-NEXT: v_mov_b32_e32 v0, v3 14483; GFX12-NEXT: s_wait_alu 0xfffe 14484; GFX12-NEXT: s_setpc_b64 s[30:31] 14485; 14486; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14487; GFX940: ; %bb.0: 14488; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14489; GFX940-NEXT: v_mov_b32_e32 v4, v0 14490; GFX940-NEXT: v_mov_b32_e32 v5, v1 14491; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 14492; GFX940-NEXT: s_movk_i32 s0, 0xf800 14493; GFX940-NEXT: s_nop 0 14494; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 14495; GFX940-NEXT: flat_load_dword v0, v[0:1] 14496; GFX940-NEXT: s_mov_b32 s1, -1 14497; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 14498; GFX940-NEXT: s_mov_b64 s[2:3], 0 14499; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14500; GFX940-NEXT: s_movk_i32 s4, 0x7fff 14501; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14502; GFX940-NEXT: s_mov_b32 s5, 0x7060302 14503; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start 14504; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 14505; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14506; GFX940-NEXT: v_mov_b32_e32 v7, v0 14507; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 14508; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 14509; GFX940-NEXT: v_sub_f32_e32 v0, v0, v1 14510; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2 14511; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 14512; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 14513; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 14514; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 14515; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 14516; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 14517; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14518; 
GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 14519; GFX940-NEXT: s_nop 0 14520; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc 14521; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] 14522; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 14523; GFX940-NEXT: buffer_wbl2 sc1 14524; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 14525; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14526; GFX940-NEXT: buffer_inv sc1 14527; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 14528; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 14529; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 14530; GFX940-NEXT: s_cbranch_execnz .LBB52_1 14531; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14532; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 14533; GFX940-NEXT: s_setpc_b64 s[30:31] 14534; 14535; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14536; GFX11: ; %bb.0: 14537; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14538; GFX11-NEXT: v_mov_b32_e32 v3, v0 14539; GFX11-NEXT: s_mov_b32 s1, 0 14540; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14541; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 14542; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 14543; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 14544; GFX11-NEXT: flat_load_b32 v0, v[4:5] 14545; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 14546; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14547; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14548; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 14549; GFX11-NEXT: .p2align 6 14550; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start 14551; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14552; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14553; GFX11-NEXT: v_mov_b32_e32 v6, v0 14554; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14555; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14556; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 14557; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14558; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) 14559; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 14560; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 14561; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 14562; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14563; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14564; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14565; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 14566; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 14567; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 14568; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14569; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14570; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 14571; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14572; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 14573; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 14574; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14575; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc 14576; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14577; GFX11-NEXT: buffer_gl1_inv 14578; GFX11-NEXT: buffer_gl0_inv 14579; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 14580; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 14581; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14582; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14583; GFX11-NEXT: s_cbranch_execnz .LBB52_1 14584; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14585; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 14586; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 14587; GFX11-NEXT: s_setpc_b64 s[30:31] 14588; 14589; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14590; GFX10: ; %bb.0: 14591; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14592; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 14593; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 14594; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14595; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14596; GFX10-NEXT: s_mov_b32 s5, 0 14597; GFX10-NEXT: flat_load_dword v0, 
v[3:4] 14598; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start 14599; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14600; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14601; GFX10-NEXT: v_mov_b32_e32 v6, v0 14602; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14603; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14604; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 14605; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 14606; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 14607; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 14608; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 14609; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 14610; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14611; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 14612; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14613; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 14614; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14615; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 14616; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 14617; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14618; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14619; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14620; GFX10-NEXT: buffer_gl1_inv 14621; GFX10-NEXT: buffer_gl0_inv 14622; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 14623; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 14624; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 14625; GFX10-NEXT: s_cbranch_execnz .LBB52_1 14626; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14627; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 14628; GFX10-NEXT: s_setpc_b64 s[30:31] 14629; 14630; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14631; GFX90A: ; %bb.0: 14632; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14633; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 14634; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 14635; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 14636; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 14637; GFX90A-NEXT: flat_load_dword v0, v[0:1] 14638; GFX90A-NEXT: s_mov_b64 s[6:7], 0 14639; GFX90A-NEXT: 
v_lshlrev_b32_e32 v1, 16, v2 14640; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 14641; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14642; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 14643; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start 14644; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14645; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14646; GFX90A-NEXT: v_mov_b32_e32 v7, v0 14647; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 14648; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 14649; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 14650; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 14651; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 14652; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 14653; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 14654; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 14655; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 14656; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 14657; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14658; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 14659; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] 14660; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc 14661; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 14662; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc 14663; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14664; GFX90A-NEXT: buffer_wbinvl1 14665; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 14666; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14667; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 14668; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 14669; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14670; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 14671; GFX90A-NEXT: s_setpc_b64 s[30:31] 14672; 14673; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14674; GFX908: ; %bb.0: 14675; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14676; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 14677; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 14678; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 14679; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 14680; GFX908-NEXT: 
flat_load_dword v0, v[0:1] 14681; GFX908-NEXT: s_mov_b64 s[6:7], 0 14682; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14683; GFX908-NEXT: s_movk_i32 s8, 0x7fff 14684; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14685; GFX908-NEXT: s_mov_b32 s9, 0x7060302 14686; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start 14687; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14688; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14689; GFX908-NEXT: v_mov_b32_e32 v6, v0 14690; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14691; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14692; GFX908-NEXT: v_sub_f32_e32 v0, v0, v1 14693; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 14694; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 14695; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 14696; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 14697; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 14698; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 14699; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 14700; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14701; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 14702; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 14703; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14704; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 14705; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14706; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14707; GFX908-NEXT: buffer_wbinvl1 14708; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 14709; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14710; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 14711; GFX908-NEXT: s_cbranch_execnz .LBB52_1 14712; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14713; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 14714; GFX908-NEXT: s_setpc_b64 s[30:31] 14715; 14716; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14717; GFX8: ; %bb.0: 14718; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14719; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 14720; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 14721; GFX8-NEXT: flat_load_dword v0, v[3:4] 14722; GFX8-NEXT: s_mov_b64 
s[6:7], 0 14723; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14724; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14725; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start 14726; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14727; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14728; GFX8-NEXT: v_mov_b32_e32 v6, v0 14729; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14730; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14731; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 14732; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 14733; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 14734; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 14735; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 14736; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 14737; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 14738; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 14739; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 14740; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14741; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 14742; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 14743; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14744; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 14745; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 14746; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 14747; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14748; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14749; GFX8-NEXT: buffer_wbinvl1 14750; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 14751; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14752; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 14753; GFX8-NEXT: s_cbranch_execnz .LBB52_1 14754; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14755; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 14756; GFX8-NEXT: s_setpc_b64 s[30:31] 14757; 14758; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: 14759; GFX7: ; %bb.0: 14760; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14761; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 14762; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 14763; GFX7-NEXT: flat_load_dword v0, v[4:5] 14764; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 14765; GFX7-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 14766; GFX7-NEXT: s_mov_b64 s[4:5], 0 14767; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 14768; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14769; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14770; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 14771; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14772; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start 14773; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14774; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 14775; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 14776; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 14777; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 14778; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14779; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 14780; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 14781; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 14782; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 14783; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 14784; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 14785; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14786; GFX7-NEXT: buffer_wbinvl1 14787; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14788; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 14789; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14790; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14791; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14792; GFX7-NEXT: s_cbranch_execnz .LBB52_1 14793; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14794; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14795; GFX7-NEXT: s_setpc_b64 s[30:31] 14796 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 14797 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst 14798 ret <2 x bfloat> %result 14799} 14800; Test: no-return atomicrmw fsub of a v2bf16 value through %ptr with no offset, syncscope("agent") seq_cst. For every RUN prefix the checks below expect a flat_atomic_cmpswap retry loop (%atomicrmw.start to %atomicrmw.end) that subtracts the two bf16 halves with v_sub_f32. 14801define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 { 14802; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 14803; GFX12: ; %bb.0: 14804; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14805; GFX12-NEXT: s_wait_expcnt 0x0 14806; GFX12-NEXT: s_wait_samplecnt 0x0 14807; GFX12-NEXT: s_wait_bvhcnt 0x0 14808; GFX12-NEXT: s_wait_kmcnt 0x0 14809; 
GFX12-NEXT: flat_load_b32 v3, v[0:1] 14810; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14811; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 14812; GFX12-NEXT: s_mov_b32 s1, 0 14813; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start 14814; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 14815; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14816; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 14817; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 14818; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14819; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 14820; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 14821; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14822; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 14823; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 14824; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 14825; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 14826; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 14827; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 14828; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 14829; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 14830; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14831; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 14832; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 14833; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 14834; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 14835; GFX12-NEXT: s_wait_storecnt 0x0 14836; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14837; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14838; GFX12-NEXT: global_inv scope:SCOPE_DEV 14839; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 14840; GFX12-NEXT: v_mov_b32_e32 v3, v2 14841; GFX12-NEXT: s_wait_alu 0xfffe 14842; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 14843; GFX12-NEXT: s_wait_alu 0xfffe 14844; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14845; GFX12-NEXT: s_cbranch_execnz .LBB53_1 14846; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 14847; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 
14848; GFX12-NEXT: s_wait_alu 0xfffe 14849; GFX12-NEXT: s_setpc_b64 s[30:31] 14850; 14851; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 14852; GFX940: ; %bb.0: 14853; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14854; GFX940-NEXT: flat_load_dword v3, v[0:1] 14855; GFX940-NEXT: s_mov_b64 s[2:3], 0 14856; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14857; GFX940-NEXT: s_movk_i32 s4, 0x7fff 14858; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 14859; GFX940-NEXT: s_mov_b32 s5, 0x7060302 14860; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start 14861; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 14862; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14863; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 14864; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 14865; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 14866; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 14867; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 14868; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 14869; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 14870; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 14871; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 14872; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 14873; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 14874; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 14875; GFX940-NEXT: s_nop 0 14876; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 14877; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 14878; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 14879; GFX940-NEXT: buffer_wbl2 sc1 14880; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 14881; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14882; GFX940-NEXT: buffer_inv sc1 14883; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14884; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 14885; GFX940-NEXT: v_mov_b32_e32 v3, v2 14886; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 14887; GFX940-NEXT: s_cbranch_execnz .LBB53_1 14888; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14889; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 14890; GFX940-NEXT: s_setpc_b64 s[30:31] 14891; 14892; GFX11-LABEL: 
flat_agent_atomic_fsub_noret_v2bf16: 14893; GFX11: ; %bb.0: 14894; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14895; GFX11-NEXT: flat_load_b32 v3, v[0:1] 14896; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14897; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 14898; GFX11-NEXT: s_mov_b32 s1, 0 14899; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 14900; GFX11-NEXT: .p2align 6 14901; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start 14902; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14903; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14904; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 14905; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 14906; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14907; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 14908; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 14909; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14910; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 14911; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 14912; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 14913; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 14914; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 14915; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 14916; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 14917; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 14918; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14919; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 14920; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 14921; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14922; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 14923; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14924; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 14925; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14926; GFX11-NEXT: buffer_gl1_inv 14927; GFX11-NEXT: buffer_gl0_inv 14928; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 14929; GFX11-NEXT: v_mov_b32_e32 v3, v2 14930; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 14931; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
14932; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14933; GFX11-NEXT: s_cbranch_execnz .LBB53_1 14934; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14935; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 14936; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 14937; GFX11-NEXT: s_setpc_b64 s[30:31] 14938; 14939; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 14940; GFX10: ; %bb.0: 14941; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14942; GFX10-NEXT: flat_load_dword v3, v[0:1] 14943; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14944; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 14945; GFX10-NEXT: s_mov_b32 s5, 0 14946; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start 14947; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14948; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14949; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 14950; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 14951; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 14952; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 14953; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 14954; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 14955; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 14956; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 14957; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 14958; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 14959; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 14960; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 14961; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 14962; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 14963; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 14964; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14965; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14966; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14967; GFX10-NEXT: buffer_gl1_inv 14968; GFX10-NEXT: buffer_gl0_inv 14969; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 14970; GFX10-NEXT: v_mov_b32_e32 v3, v2 14971; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 14972; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 14973; GFX10-NEXT: s_cbranch_execnz .LBB53_1 14974; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14975; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 14976; GFX10-NEXT: s_setpc_b64 s[30:31] 14977; 14978; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 14979; GFX90A: ; %bb.0: 14980; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14981; GFX90A-NEXT: flat_load_dword v3, v[0:1] 14982; GFX90A-NEXT: s_mov_b64 s[6:7], 0 14983; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14984; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 14985; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 14986; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 14987; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start 14988; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14989; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14990; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 14991; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 14992; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 14993; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 14994; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 14995; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 14996; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 14997; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 14998; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 14999; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 15000; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15001; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15002; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15003; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15004; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 15005; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15006; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15007; GFX90A-NEXT: buffer_wbinvl1 15008; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15009; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15010; GFX90A-NEXT: v_mov_b32_e32 v3, v2 15011; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 15012; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 15013; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15014; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 15015; GFX90A-NEXT: s_setpc_b64 s[30:31] 15016; 15017; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 15018; GFX908: ; %bb.0: 
15019; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15020; GFX908-NEXT: flat_load_dword v3, v[0:1] 15021; GFX908-NEXT: s_mov_b64 s[6:7], 0 15022; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15023; GFX908-NEXT: s_movk_i32 s8, 0x7fff 15024; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15025; GFX908-NEXT: s_mov_b32 s9, 0x7060302 15026; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start 15027; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15028; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15029; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15030; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15031; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 15032; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 15033; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 15034; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 15035; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 15036; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 15037; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 15038; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 15039; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15040; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15041; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15042; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15043; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 15044; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15045; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15046; GFX908-NEXT: buffer_wbinvl1 15047; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15048; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15049; GFX908-NEXT: v_mov_b32_e32 v3, v2 15050; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 15051; GFX908-NEXT: s_cbranch_execnz .LBB53_1 15052; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15053; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 15054; GFX908-NEXT: s_setpc_b64 s[30:31] 15055; 15056; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 15057; GFX8: ; %bb.0: 15058; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15059; GFX8-NEXT: flat_load_dword v3, v[0:1] 15060; GFX8-NEXT: s_mov_b64 s[6:7], 0 15061; GFX8-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 15062; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15063; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start 15064; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15065; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15066; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15067; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15068; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 15069; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 15070; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 15071; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 15072; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 15073; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 15074; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 15075; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 15076; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 15077; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15078; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 15079; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15080; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15081; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15082; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 15083; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 15084; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15085; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15086; GFX8-NEXT: buffer_wbinvl1 15087; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15088; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15089; GFX8-NEXT: v_mov_b32_e32 v3, v2 15090; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 15091; GFX8-NEXT: s_cbranch_execnz .LBB53_1 15092; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15093; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 15094; GFX8-NEXT: s_setpc_b64 s[30:31] 15095; 15096; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16: 15097; GFX7: ; %bb.0: 15098; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15099; GFX7-NEXT: flat_load_dword v5, v[0:1] 15100; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 15101; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 15102; GFX7-NEXT: s_mov_b64 s[4:5], 0 15103; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15104; GFX7-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 15105; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15106; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 15107; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 15108; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start 15109; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15110; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 15111; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 15112; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 15113; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 15114; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 15115; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 15116; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 15117; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 15118; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 15119; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 15120; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 15121; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15122; GFX7-NEXT: buffer_wbinvl1 15123; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 15124; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 15125; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15126; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 15127; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15128; GFX7-NEXT: s_cbranch_execnz .LBB53_1 15129; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15130; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15131; GFX7-NEXT: s_setpc_b64 s[30:31] 15132 %unused = atomicrmw fsub ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst 15133 ret void 15134} 15135 15136define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { 15137; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15138; GFX12: ; %bb.0: 15139; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15140; GFX12-NEXT: s_wait_expcnt 0x0 15141; GFX12-NEXT: s_wait_samplecnt 0x0 15142; GFX12-NEXT: s_wait_bvhcnt 0x0 15143; GFX12-NEXT: s_wait_kmcnt 0x0 15144; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 15145; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15146; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15147; GFX12-NEXT: s_mov_b32 s1, 0 15148; GFX12-NEXT: 
.LBB54_1: ; %atomicrmw.start 15149; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 15150; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15151; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15152; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15153; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15154; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 15155; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 15156; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15157; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 15158; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 15159; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 15160; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 15161; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15162; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15163; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15164; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15165; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15166; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15167; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15168; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 15169; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15170; GFX12-NEXT: s_wait_storecnt 0x0 15171; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 15172; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15173; GFX12-NEXT: global_inv scope:SCOPE_DEV 15174; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15175; GFX12-NEXT: v_mov_b32_e32 v3, v2 15176; GFX12-NEXT: s_wait_alu 0xfffe 15177; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 15178; GFX12-NEXT: s_wait_alu 0xfffe 15179; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15180; GFX12-NEXT: s_cbranch_execnz .LBB54_1 15181; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 15182; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 15183; GFX12-NEXT: s_wait_alu 0xfffe 15184; GFX12-NEXT: s_setpc_b64 s[30:31] 15185; 15186; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15187; GFX940: ; %bb.0: 
15188; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15189; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 15190; GFX940-NEXT: s_mov_b64 s[2:3], 0 15191; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15192; GFX940-NEXT: s_movk_i32 s4, 0x7fff 15193; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15194; GFX940-NEXT: s_mov_b32 s5, 0x7060302 15195; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start 15196; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 15197; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15198; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15199; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15200; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 15201; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 15202; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 15203; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 15204; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 15205; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 15206; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 15207; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 15208; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15209; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 15210; GFX940-NEXT: s_nop 0 15211; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15212; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 15213; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 15214; GFX940-NEXT: buffer_wbl2 sc1 15215; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 15216; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15217; GFX940-NEXT: buffer_inv sc1 15218; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15219; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 15220; GFX940-NEXT: v_mov_b32_e32 v3, v2 15221; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 15222; GFX940-NEXT: s_cbranch_execnz .LBB54_1 15223; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 15224; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 15225; GFX940-NEXT: s_setpc_b64 s[30:31] 15226; 15227; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15228; GFX11: ; %bb.0: 15229; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15230; 
GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 15231; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15232; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15233; GFX11-NEXT: s_mov_b32 s1, 0 15234; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 15235; GFX11-NEXT: .p2align 6 15236; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start 15237; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15238; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15239; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15240; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15241; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15242; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 15243; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 15244; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15245; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 15246; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 15247; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 15248; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 15249; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15250; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15251; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15252; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15253; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15254; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15255; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15256; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15257; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15258; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15259; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 15260; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15261; GFX11-NEXT: buffer_gl1_inv 15262; GFX11-NEXT: buffer_gl0_inv 15263; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15264; GFX11-NEXT: v_mov_b32_e32 v3, v2 15265; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 15266; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15267; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15268; GFX11-NEXT: s_cbranch_execnz .LBB54_1 
15269; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15270; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 15271; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 15272; GFX11-NEXT: s_setpc_b64 s[30:31] 15273; 15274; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15275; GFX10: ; %bb.0: 15276; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15277; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 15278; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 15279; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15280; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15281; GFX10-NEXT: s_mov_b32 s5, 0 15282; GFX10-NEXT: flat_load_dword v3, v[0:1] 15283; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start 15284; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15285; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15286; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15287; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15288; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 15289; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 15290; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 15291; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 15292; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 15293; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 15294; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15295; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15296; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15297; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 15298; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15299; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 15300; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15301; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 15302; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15303; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15304; GFX10-NEXT: buffer_gl1_inv 15305; GFX10-NEXT: buffer_gl0_inv 15306; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15307; GFX10-NEXT: v_mov_b32_e32 v3, v2 15308; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 15309; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 15310; GFX10-NEXT: s_cbranch_execnz .LBB54_1 15311; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 15312; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 15313; GFX10-NEXT: s_setpc_b64 s[30:31] 15314; 15315; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15316; GFX90A: ; %bb.0: 15317; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15318; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 15319; GFX90A-NEXT: s_mov_b64 s[6:7], 0 15320; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15321; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 15322; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15323; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 15324; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start 15325; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 15326; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15327; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15328; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15329; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 15330; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 15331; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 15332; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 15333; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 15334; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 15335; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 15336; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 15337; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15338; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15339; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15340; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15341; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 15342; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 15343; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15344; GFX90A-NEXT: buffer_wbinvl1 15345; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15346; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15347; GFX90A-NEXT: v_mov_b32_e32 v3, v2 15348; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 15349; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 15350; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15351; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 15352; GFX90A-NEXT: s_setpc_b64 s[30:31] 
15353; 15354; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15355; GFX908: ; %bb.0: 15356; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15357; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 15358; GFX908-NEXT: s_mov_b64 s[6:7], 0 15359; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15360; GFX908-NEXT: s_movk_i32 s8, 0x7fff 15361; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15362; GFX908-NEXT: s_mov_b32 s9, 0x7060302 15363; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start 15364; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15365; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15366; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15367; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15368; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 15369; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 15370; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 15371; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 15372; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 15373; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 15374; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 15375; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 15376; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15377; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15378; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15379; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15380; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 15381; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 15382; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15383; GFX908-NEXT: buffer_wbinvl1 15384; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15385; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15386; GFX908-NEXT: v_mov_b32_e32 v3, v2 15387; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 15388; GFX908-NEXT: s_cbranch_execnz .LBB54_1 15389; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15390; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 15391; GFX908-NEXT: s_setpc_b64 s[30:31] 15392; 15393; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15394; GFX8: ; %bb.0: 15395; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 15396; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 15397; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 15398; GFX8-NEXT: flat_load_dword v3, v[0:1] 15399; GFX8-NEXT: s_mov_b64 s[6:7], 0 15400; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15401; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15402; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start 15403; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15404; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15405; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15406; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15407; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 15408; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 15409; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 15410; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 15411; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 15412; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 15413; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 15414; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 15415; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 15416; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15417; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 15418; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15419; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15420; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15421; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 15422; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 15423; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15424; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15425; GFX8-NEXT: buffer_wbinvl1 15426; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15427; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15428; GFX8-NEXT: v_mov_b32_e32 v3, v2 15429; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 15430; GFX8-NEXT: s_cbranch_execnz .LBB54_1 15431; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15432; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 15433; GFX8-NEXT: s_setpc_b64 s[30:31] 15434; 15435; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: 15436; GFX7: ; %bb.0: 15437; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15438; GFX7-NEXT: 
v_add_i32_e32 v0, vcc, 0x7fc, v0 15439; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 15440; GFX7-NEXT: flat_load_dword v5, v[0:1] 15441; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 15442; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 15443; GFX7-NEXT: s_mov_b64 s[4:5], 0 15444; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15445; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15446; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15447; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 15448; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 15449; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start 15450; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15451; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 15452; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 15453; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 15454; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 15455; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 15456; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 15457; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 15458; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 15459; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 15460; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 15461; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 15462; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15463; GFX7-NEXT: buffer_wbinvl1 15464; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 15465; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 15466; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15467; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 15468; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15469; GFX7-NEXT: s_cbranch_execnz .LBB54_1 15470; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15471; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15472; GFX7-NEXT: s_setpc_b64 s[30:31] 15473 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 15474 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst 15475 ret void 15476} 15477 15478define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 { 15479; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15480; GFX12: ; %bb.0: 
15481; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15482; GFX12-NEXT: s_wait_expcnt 0x0 15483; GFX12-NEXT: s_wait_samplecnt 0x0 15484; GFX12-NEXT: s_wait_bvhcnt 0x0 15485; GFX12-NEXT: s_wait_kmcnt 0x0 15486; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 15487; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15488; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15489; GFX12-NEXT: s_mov_b32 s1, 0 15490; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start 15491; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 15492; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15493; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15494; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15495; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15496; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 15497; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 15498; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15499; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 15500; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 15501; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 15502; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 15503; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15504; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15505; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15506; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15507; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15508; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15509; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15510; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 15511; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15512; GFX12-NEXT: s_wait_storecnt 0x0 15513; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 15514; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15515; GFX12-NEXT: global_inv scope:SCOPE_DEV 15516; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15517; GFX12-NEXT: v_mov_b32_e32 v3, v2 15518; GFX12-NEXT: s_wait_alu 0xfffe 15519; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 15520; 
GFX12-NEXT: s_wait_alu 0xfffe 15521; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15522; GFX12-NEXT: s_cbranch_execnz .LBB55_1 15523; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 15524; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 15525; GFX12-NEXT: s_wait_alu 0xfffe 15526; GFX12-NEXT: s_setpc_b64 s[30:31] 15527; 15528; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15529; GFX940: ; %bb.0: 15530; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15531; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 15532; GFX940-NEXT: s_movk_i32 s0, 0xf800 15533; GFX940-NEXT: s_nop 0 15534; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 15535; GFX940-NEXT: flat_load_dword v3, v[4:5] 15536; GFX940-NEXT: s_mov_b32 s1, -1 15537; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 15538; GFX940-NEXT: s_mov_b64 s[2:3], 0 15539; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15540; GFX940-NEXT: s_movk_i32 s4, 0x7fff 15541; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15542; GFX940-NEXT: s_mov_b32 s5, 0x7060302 15543; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start 15544; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 15545; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15546; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15547; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15548; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 15549; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 15550; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 15551; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 15552; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 15553; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 15554; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 15555; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 15556; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15557; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 15558; GFX940-NEXT: s_nop 0 15559; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15560; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 15561; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 15562; GFX940-NEXT: buffer_wbl2 sc1 15563; GFX940-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 15564; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15565; GFX940-NEXT: buffer_inv sc1 15566; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15567; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 15568; GFX940-NEXT: v_mov_b32_e32 v3, v2 15569; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 15570; GFX940-NEXT: s_cbranch_execnz .LBB55_1 15571; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 15572; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 15573; GFX940-NEXT: s_setpc_b64 s[30:31] 15574; 15575; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15576; GFX11: ; %bb.0: 15577; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15578; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 15579; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 15580; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 15581; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 15582; GFX11-NEXT: flat_load_b32 v3, v[3:4] 15583; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15584; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15585; GFX11-NEXT: s_mov_b32 s1, 0 15586; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 15587; GFX11-NEXT: .p2align 6 15588; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start 15589; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15590; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15591; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15592; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15593; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15594; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 15595; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 15596; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15597; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 15598; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 15599; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 15600; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 15601; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15602; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15603; GFX11-NEXT: 
v_add3_u32 v8, v8, v6, 0x7fff 15604; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15605; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15606; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15607; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15608; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15609; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15610; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15611; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 15612; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15613; GFX11-NEXT: buffer_gl1_inv 15614; GFX11-NEXT: buffer_gl0_inv 15615; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15616; GFX11-NEXT: v_mov_b32_e32 v3, v2 15617; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 15618; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15619; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15620; GFX11-NEXT: s_cbranch_execnz .LBB55_1 15621; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15622; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 15623; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 15624; GFX11-NEXT: s_setpc_b64 s[30:31] 15625; 15626; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15627; GFX10: ; %bb.0: 15628; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15629; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 15630; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 15631; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15632; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15633; GFX10-NEXT: s_mov_b32 s5, 0 15634; GFX10-NEXT: flat_load_dword v3, v[0:1] 15635; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start 15636; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15637; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15638; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15639; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15640; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 15641; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 15642; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 15643; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 15644; GFX10-NEXT: 
v_or_b32_e32 v9, 0x400000, v2 15645; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 15646; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15647; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15648; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15649; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 15650; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15651; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 15652; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15653; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 15654; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15655; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15656; GFX10-NEXT: buffer_gl1_inv 15657; GFX10-NEXT: buffer_gl0_inv 15658; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15659; GFX10-NEXT: v_mov_b32_e32 v3, v2 15660; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 15661; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 15662; GFX10-NEXT: s_cbranch_execnz .LBB55_1 15663; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 15664; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 15665; GFX10-NEXT: s_setpc_b64 s[30:31] 15666; 15667; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15668; GFX90A: ; %bb.0: 15669; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15670; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 15671; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 15672; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 15673; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 15674; GFX90A-NEXT: flat_load_dword v1, v[0:1] 15675; GFX90A-NEXT: s_mov_b64 s[6:7], 0 15676; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 15677; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 15678; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15679; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 15680; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start 15681; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 15682; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15683; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 15684; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 15685; GFX90A-NEXT: v_sub_f32_e32 v0, 
v0, v3 15686; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2 15687; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 15688; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 15689; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 15690; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 15691; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 15692; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 15693; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15694; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 15695; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 15696; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15697; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 15698; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 15699; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15700; GFX90A-NEXT: buffer_wbinvl1 15701; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 15702; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15703; GFX90A-NEXT: v_mov_b32_e32 v1, v0 15704; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 15705; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 15706; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15707; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 15708; GFX90A-NEXT: s_setpc_b64 s[30:31] 15709; 15710; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15711; GFX908: ; %bb.0: 15712; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15713; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 15714; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 15715; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 15716; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 15717; GFX908-NEXT: flat_load_dword v1, v[0:1] 15718; GFX908-NEXT: s_mov_b64 s[6:7], 0 15719; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 15720; GFX908-NEXT: s_movk_i32 s8, 0x7fff 15721; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15722; GFX908-NEXT: s_mov_b32 s9, 0x7060302 15723; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start 15724; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15725; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15726; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 15727; 
GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 15728; GFX908-NEXT: v_sub_f32_e32 v0, v0, v5 15729; GFX908-NEXT: v_sub_f32_e32 v6, v6, v2 15730; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 15731; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 15732; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 15733; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 15734; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 15735; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 15736; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15737; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 15738; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 15739; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15740; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 15741; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 15742; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15743; GFX908-NEXT: buffer_wbinvl1 15744; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 15745; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15746; GFX908-NEXT: v_mov_b32_e32 v1, v0 15747; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 15748; GFX908-NEXT: s_cbranch_execnz .LBB55_1 15749; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15750; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 15751; GFX908-NEXT: s_setpc_b64 s[30:31] 15752; 15753; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15754; GFX8: ; %bb.0: 15755; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15756; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 15757; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 15758; GFX8-NEXT: flat_load_dword v3, v[0:1] 15759; GFX8-NEXT: s_mov_b64 s[6:7], 0 15760; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15761; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15762; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start 15763; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15764; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15765; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15766; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15767; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 15768; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 15769; GFX8-NEXT: 
v_bfe_u32 v7, v2, 16, 1 15770; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 15771; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 15772; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 15773; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 15774; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 15775; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 15776; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15777; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 15778; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15779; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15780; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15781; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 15782; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 15783; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15784; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15785; GFX8-NEXT: buffer_wbinvl1 15786; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15787; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15788; GFX8-NEXT: v_mov_b32_e32 v3, v2 15789; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 15790; GFX8-NEXT: s_cbranch_execnz .LBB55_1 15791; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15792; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 15793; GFX8-NEXT: s_setpc_b64 s[30:31] 15794; 15795; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: 15796; GFX7: ; %bb.0: 15797; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15798; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 15799; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 15800; GFX7-NEXT: flat_load_dword v5, v[0:1] 15801; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 15802; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 15803; GFX7-NEXT: s_mov_b64 s[4:5], 0 15804; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15805; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15806; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15807; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 15808; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 15809; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start 15810; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15811; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 15812; 
GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 15813; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 15814; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 15815; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 15816; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 15817; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 15818; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 15819; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 15820; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 15821; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc 15822; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15823; GFX7-NEXT: buffer_wbinvl1 15824; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 15825; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 15826; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15827; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 15828; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15829; GFX7-NEXT: s_cbranch_execnz .LBB55_1 15830; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15831; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15832; GFX7-NEXT: s_setpc_b64 s[30:31] 15833 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 15834 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst 15835 ret void 15836} 15837 15838define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { 15839; GFX12-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 15840; GFX12: ; %bb.0: 15841; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15842; GFX12-NEXT: s_wait_expcnt 0x0 15843; GFX12-NEXT: s_wait_samplecnt 0x0 15844; GFX12-NEXT: s_wait_bvhcnt 0x0 15845; GFX12-NEXT: s_wait_kmcnt 0x0 15846; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 15847; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15848; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15849; GFX12-NEXT: s_mov_b32 s1, 0 15850; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start 15851; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 15852; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15853; GFX12-NEXT: v_mov_b32_e32 v6, v3 15854; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 15855; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15856; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 15857; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 15858; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15859; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 15860; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 15861; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 15862; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 15863; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15864; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 15865; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 15866; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 15867; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 15868; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15869; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 15870; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 15871; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 15872; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 15873; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 15874; GFX12-NEXT: global_wb scope:SCOPE_SYS 15875; GFX12-NEXT: s_wait_storecnt 0x0 15876; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 15877; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15878; GFX12-NEXT: global_inv scope:SCOPE_SYS 15879; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 15880; GFX12-NEXT: s_wait_alu 0xfffe 15881; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 15882; GFX12-NEXT: s_wait_alu 0xfffe 15883; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15884; GFX12-NEXT: s_cbranch_execnz .LBB56_1 15885; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 15886; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 15887; GFX12-NEXT: v_mov_b32_e32 v0, v3 15888; GFX12-NEXT: s_wait_alu 0xfffe 15889; GFX12-NEXT: s_setpc_b64 s[30:31] 15890; 15891; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 15892; GFX940: ; %bb.0: 15893; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15894; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 15895; GFX940-NEXT: s_mov_b64 s[2:3], 0 15896; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15897; GFX940-NEXT: s_movk_i32 s4, 0x7fff 15898; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15899; GFX940-NEXT: s_mov_b32 s5, 0x7060302 15900; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start 15901; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 15902; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15903; GFX940-NEXT: v_mov_b32_e32 v7, v3 15904; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 15905; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 15906; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 15907; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 15908; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 15909; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 15910; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 15911; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 15912; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 15913; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 15914; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 15915; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 15916; GFX940-NEXT: s_nop 0 15917; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 15918; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] 15919; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 15920; GFX940-NEXT: buffer_wbl2 sc0 sc1 15921; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 15922; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15923; GFX940-NEXT: buffer_inv sc0 sc1 15924; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 15925; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 15926; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 15927; GFX940-NEXT: s_cbranch_execnz .LBB56_1 15928; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 15929; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 15930; GFX940-NEXT: v_mov_b32_e32 v0, v3 15931; GFX940-NEXT: s_setpc_b64 s[30:31] 15932; 15933; GFX11-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 15934; GFX11: ; %bb.0: 15935; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 15936; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 15937; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15938; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15939; GFX11-NEXT: s_mov_b32 s1, 0 15940; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 15941; GFX11-NEXT: .p2align 6 15942; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start 15943; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15944; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15945; GFX11-NEXT: v_mov_b32_e32 v6, v3 15946; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 15947; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15948; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 15949; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 15950; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15951; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 15952; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 15953; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 15954; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 15955; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15956; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 15957; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 15958; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 15959; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 15960; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15961; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 15962; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 15963; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 15964; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 15965; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 15966; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15967; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc 15968; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15969; GFX11-NEXT: buffer_gl1_inv 15970; GFX11-NEXT: buffer_gl0_inv 15971; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 15972; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 
15973; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15974; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15975; GFX11-NEXT: s_cbranch_execnz .LBB56_1 15976; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15977; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 15978; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 15979; GFX11-NEXT: v_mov_b32_e32 v0, v3 15980; GFX11-NEXT: s_setpc_b64 s[30:31] 15981; 15982; GFX10-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 15983; GFX10: ; %bb.0: 15984; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15985; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 15986; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 15987; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 15988; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15989; GFX10-NEXT: s_mov_b32 s5, 0 15990; GFX10-NEXT: flat_load_dword v0, v[3:4] 15991; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start 15992; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15993; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15994; GFX10-NEXT: v_mov_b32_e32 v6, v0 15995; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 15996; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15997; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 15998; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 15999; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 16000; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 16001; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 16002; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 16003; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16004; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 16005; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16006; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 16007; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16008; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 16009; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 16010; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16011; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 16012; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16013; GFX10-NEXT: buffer_gl1_inv 16014; GFX10-NEXT: buffer_gl0_inv 16015; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 16016; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 16017; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 16018; GFX10-NEXT: s_cbranch_execnz .LBB56_1 16019; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16020; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 16021; GFX10-NEXT: s_setpc_b64 s[30:31] 16022; 16023; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 16024; GFX90A: ; %bb.0: 16025; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16026; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 16027; GFX90A-NEXT: s_mov_b64 s[6:7], 0 16028; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16029; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 16030; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16031; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 16032; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start 16033; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16034; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16035; GFX90A-NEXT: v_mov_b32_e32 v7, v3 16036; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 16037; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 16038; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 16039; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 16040; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 16041; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 16042; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 16043; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 16044; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 16045; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 16046; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16047; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16048; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 16049; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16050; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 16051; GFX90A-NEXT: buffer_wbl2 16052; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc 16053; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16054; GFX90A-NEXT: buffer_invl2 16055; GFX90A-NEXT: buffer_wbinvl1 16056; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 16057; GFX90A-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] 16058; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 16059; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 16060; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16061; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 16062; GFX90A-NEXT: v_mov_b32_e32 v0, v3 16063; GFX90A-NEXT: s_setpc_b64 s[30:31] 16064; 16065; GFX908-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 16066; GFX908: ; %bb.0: 16067; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16068; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 16069; GFX908-NEXT: s_mov_b64 s[6:7], 0 16070; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16071; GFX908-NEXT: s_movk_i32 s8, 0x7fff 16072; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16073; GFX908-NEXT: s_mov_b32 s9, 0x7060302 16074; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start 16075; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16076; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16077; GFX908-NEXT: v_mov_b32_e32 v6, v3 16078; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16079; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16080; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 16081; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 16082; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 16083; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 16084; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 16085; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 16086; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 16087; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 16088; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16089; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16090; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 16091; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16092; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 16093; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc 16094; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16095; GFX908-NEXT: buffer_wbinvl1 16096; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 16097; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16098; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 16099; GFX908-NEXT: 
s_cbranch_execnz .LBB56_1 16100; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16101; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 16102; GFX908-NEXT: v_mov_b32_e32 v0, v3 16103; GFX908-NEXT: s_setpc_b64 s[30:31] 16104; 16105; GFX8-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 16106; GFX8: ; %bb.0: 16107; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16108; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 16109; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 16110; GFX8-NEXT: flat_load_dword v0, v[3:4] 16111; GFX8-NEXT: s_mov_b64 s[6:7], 0 16112; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 16113; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16114; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start 16115; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16116; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16117; GFX8-NEXT: v_mov_b32_e32 v6, v0 16118; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 16119; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16120; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 16121; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 16122; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 16123; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 16124; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 16125; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 16126; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 16127; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 16128; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 16129; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16130; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 16131; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 16132; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16133; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 16134; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 16135; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 16136; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 16137; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16138; GFX8-NEXT: buffer_wbinvl1 16139; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 16140; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16141; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 16142; GFX8-NEXT: 
s_cbranch_execnz .LBB56_1 16143; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16144; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 16145; GFX8-NEXT: s_setpc_b64 s[30:31] 16146; 16147; GFX7-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: 16148; GFX7: ; %bb.0: 16149; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16150; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 16151; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 16152; GFX7-NEXT: flat_load_dword v0, v[4:5] 16153; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 16154; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 16155; GFX7-NEXT: s_mov_b64 s[4:5], 0 16156; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 16157; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 16158; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16159; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 16160; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 16161; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start 16162; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16163; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 16164; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 16165; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 16166; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 16167; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 16168; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 16169; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 16170; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 16171; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 16172; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 16173; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 16174; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16175; GFX7-NEXT: buffer_wbinvl1 16176; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 16177; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 16178; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16179; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 16180; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16181; GFX7-NEXT: s_cbranch_execnz .LBB56_1 16182; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16183; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16184; GFX7-NEXT: s_setpc_b64 s[30:31] 16185 %gep = getelementptr <2 x bfloat>, 
ptr %ptr, i64 511
  %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst
  ret <2 x bfloat> %result
}

; No-return variant of the <2 x bfloat> fsub test at a positive offset: the
; atomicrmw result (%unused) is discarded and the function returns void.
;
; The GEP selects element 511 of <2 x bfloat>, i.e. a byte offset of
; 511 * 4 = 2044 (0x7fc). In the checks below, the GFX12, GFX940, GFX11,
; GFX90A and GFX908 sequences carry this as "offset:2044" on the flat
; load/cmpswap, while the GFX10, GFX8 and GFX7 sequences add 0x7fc to the
; pointer with explicit VALU add/add-with-carry instructions first.
;
; Every checked sequence is a compare-and-swap retry loop (flat_atomic_cmpswap
; inside the %atomicrmw.start block) rather than a single fsub atomic.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
; GFX12-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
  %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }