; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s

; Each function below performs a seq_cst `atomicrmw fadd` of a float through a
; plain `ptr` argument. The functions differ in sync scope, in whether the
; result is used, and in whether the !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode annotations are attached. The check lines are
; regenerated by the script named on the first line; do not edit them by hand.

; No syncscope given, result returned, both annotations present.
; Expected code per the checks: a flat_atomic_cmpswap retry loop on gfx908; a
; runtime split on src_shared_base / src_private_base into ds_add_rtn_f32, a
; buffer load + add + store, or global_atomic_add_f32 on gfx90a; a single
; flat_atomic_add_f32 on gfx940, gfx1100 and gfx1200.
define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_system:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1_vol
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_system:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT:    ; implicit-def: $vgpr3
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB0_6
; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT:    ; implicit-def: $vgpr3
; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB0_3
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_invl2
; GFX90A-NEXT:    buffer_wbinvl1_vol
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB0_3: ; %Flow
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB0_5
; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:  .LBB0_5: ; %Flow1
; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB0_6: ; %Flow2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB0_8
; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:  .LBB0_8: ; %atomicrmw.phi
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: syncscope_system:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_wbl2 sc0 sc1
; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc0 sc1
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_system:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    buffer_gl1_inv
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: syncscope_system:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    global_wb scope:SCOPE_SYS
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SYS
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret float %res
}

; syncscope("workgroup"), result returned, both annotations present.
; Expected code per the checks: a flat_atomic_cmpswap retry loop on gfx908; the
; shared/private/global split on gfx90a; a single returning
; flat_atomic_add_f32 on gfx940, gfx1100 and gfx1200.
define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_workgroup_rtn:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_workgroup_rtn:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT:    ; implicit-def: $vgpr3
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_6
; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT:    ; implicit-def: $vgpr3
; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_3
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB1_3: ; %Flow
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_5
; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:  .LBB1_5: ; %Flow1
; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB1_6: ; %Flow2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB1_8
; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:  .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: syncscope_workgroup_rtn:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_workgroup_rtn:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: syncscope_workgroup_rtn:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SE
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret float %res
}

; syncscope("workgroup"), result unused (the function returns void), both
; annotations present.
; Expected code per the checks: gfx908 and gfx90a both use the
; shared/private/global split with the no-return ds_add_f32 and
; global_atomic_add_f32; gfx940, gfx1100 and gfx1200 use a single no-return
; flat_atomic_add_f32.
define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_workgroup_nortn:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB2_3
; GFX908-NEXT:  ; %bb.1: ; %Flow2
; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB2_8
; GFX908-NEXT:  .LBB2_2: ; %atomicrmw.phi
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
; GFX908-NEXT:  .LBB2_3: ; %atomicrmw.check.private
; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX908-NEXT:    s_cbranch_execz .LBB2_5
; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT:    ; implicit-def: $vgpr2
; GFX908-NEXT:  .LBB2_5: ; %Flow
; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX908-NEXT:    s_cbranch_execz .LBB2_7
; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX908-NEXT:  .LBB2_7: ; %Flow1
; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT:    ; implicit-def: $vgpr2
; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT:    s_cbranch_execz .LBB2_2
; GFX908-NEXT:  .LBB2_8: ; %atomicrmw.shared
; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX908-NEXT:    ds_add_f32 v0, v2
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_workgroup_nortn:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB2_3
; GFX90A-NEXT:  ; %bb.1: ; %Flow2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB2_8
; GFX90A-NEXT:  .LBB2_2: ; %atomicrmw.phi
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
; GFX90A-NEXT:  .LBB2_3: ; %atomicrmw.check.private
; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB2_5
; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:  .LBB2_5: ; %Flow
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT:    s_cbranch_execz .LBB2_7
; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT:  .LBB2_7: ; %Flow1
; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT:    ; implicit-def: $vgpr2
; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_cbranch_execz .LBB2_2
; GFX90A-NEXT:  .LBB2_8: ; %atomicrmw.shared
; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT:    ds_add_f32 v0, v2
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: syncscope_workgroup_nortn:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_workgroup_nortn:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_add_f32 v[0:1], v2
; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: syncscope_workgroup_nortn:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
; GFX1200-NEXT:    s_wait_storecnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SE
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret void
}

; syncscope("workgroup"), result returned, with neither !amdgpu.* annotation
; and without attribute group #0.
; Expected code per the checks: a cmpswap retry loop on gfx908, gfx90a and
; gfx1100; a single flat_atomic_add_f32 on gfx940 and gfx1200.
define float @no_unsafe(ptr %addr, float %val) {
; GFX908-LABEL: no_unsafe:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    flat_load_dword v3, v[0:1]
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v4, v3
; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v0, v3
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: no_unsafe:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: no_unsafe:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: no_unsafe:
; GFX1100:       ; %bb.0:
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    flat_load_b32 v3, v[0:1]
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v4, v3
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT:    v_add_f32_e32 v3, v4, v2
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    buffer_gl0_inv
; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX1100-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1100-NEXT:    s_cbranch_execnz .LBB3_1
; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: no_unsafe:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_wait_storecnt 0x0
; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    global_inv scope:SCOPE_SE
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret float %res
}

attributes #0 = { nounwind }

!0 = !{}