1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 6; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s 7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s 8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s 9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s 10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s 11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s 12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s 13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s 14 15define amdgpu_kernel void @flat_workgroup_unordered_load( 16; GFX7-LABEL: flat_workgroup_unordered_load: 17; GFX7: ; %bb.0: ; %entry 18; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 19; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 20; GFX7-NEXT: s_waitcnt lgkmcnt(0) 21; GFX7-NEXT: v_mov_b32_e32 v0, s6 22; GFX7-NEXT: v_mov_b32_e32 v1, s7 23; GFX7-NEXT: flat_load_dword v2, v[0:1] 24; GFX7-NEXT: v_mov_b32_e32 v0, s4 25; GFX7-NEXT: v_mov_b32_e32 v1, s5 26; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 27; GFX7-NEXT: flat_store_dword v[0:1], 
v2 28; GFX7-NEXT: s_endpgm 29; 30; GFX10-WGP-LABEL: flat_workgroup_unordered_load: 31; GFX10-WGP: ; %bb.0: ; %entry 32; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 33; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 34; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 35; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 36; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 37; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 38; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 39; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 40; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 41; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 42; GFX10-WGP-NEXT: s_endpgm 43; 44; GFX10-CU-LABEL: flat_workgroup_unordered_load: 45; GFX10-CU: ; %bb.0: ; %entry 46; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 47; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 48; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 49; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 50; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 51; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 52; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 53; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 54; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 55; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 56; GFX10-CU-NEXT: s_endpgm 57; 58; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load: 59; SKIP-CACHE-INV: ; %bb.0: ; %entry 60; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 61; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 62; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 63; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 64; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 65; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 66; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 67; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 68; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 69; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 70; SKIP-CACHE-INV-NEXT: s_endpgm 71; 72; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: 73; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 74; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 75; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 76; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 77; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 78; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 79; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 80; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 81; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 82; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 83; 84; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: 85; GFX90A-TGSPLIT: ; %bb.0: ; %entry 86; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 87; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 88; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 89; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 90; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 91; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 92; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 93; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 94; GFX90A-TGSPLIT-NEXT: s_endpgm 95; 96; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: 97; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 98; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 99; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 100; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 101; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 102; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 103; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 104; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 105; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 106; GFX940-NOTTGSPLIT-NEXT: s_endpgm 107; 108; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load: 109; GFX940-TGSPLIT: ; %bb.0: ; %entry 110; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 111; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 112; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 113; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] 114; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 115; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 116; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 117; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 118; GFX940-TGSPLIT-NEXT: s_endpgm 119; 120; GFX11-WGP-LABEL: flat_workgroup_unordered_load: 121; GFX11-WGP: ; %bb.0: ; %entry 122; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 123; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 124; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 125; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 126; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 127; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 128; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 129; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 130; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 131; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 132; GFX11-WGP-NEXT: s_endpgm 133; 134; GFX11-CU-LABEL: flat_workgroup_unordered_load: 135; GFX11-CU: ; %bb.0: ; %entry 136; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 137; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 138; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 139; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 140; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 141; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 142; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 143; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 144; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 145; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 146; GFX11-CU-NEXT: s_endpgm 147; 148; GFX12-WGP-LABEL: flat_workgroup_unordered_load: 149; GFX12-WGP: ; %bb.0: ; %entry 150; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 151; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 152; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 153; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 154; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 155; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 156; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 157; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 158; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 159; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 160; GFX12-WGP-NEXT: s_endpgm 161; 162; 
GFX12-CU-LABEL: flat_workgroup_unordered_load: 163; GFX12-CU: ; %bb.0: ; %entry 164; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 165; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 166; GFX12-CU-NEXT: s_wait_kmcnt 0x0 167; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 168; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 169; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 170; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 171; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 172; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 173; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 174; GFX12-CU-NEXT: s_endpgm 175 ptr %in, ptr %out) { 176entry: 177 %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4 178 store i32 %val, ptr %out 179 ret void 180} 181 182define amdgpu_kernel void @flat_workgroup_monotonic_load( 183; GFX7-LABEL: flat_workgroup_monotonic_load: 184; GFX7: ; %bb.0: ; %entry 185; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 186; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 187; GFX7-NEXT: s_waitcnt lgkmcnt(0) 188; GFX7-NEXT: v_mov_b32_e32 v0, s6 189; GFX7-NEXT: v_mov_b32_e32 v1, s7 190; GFX7-NEXT: flat_load_dword v2, v[0:1] 191; GFX7-NEXT: v_mov_b32_e32 v0, s4 192; GFX7-NEXT: v_mov_b32_e32 v1, s5 193; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 194; GFX7-NEXT: flat_store_dword v[0:1], v2 195; GFX7-NEXT: s_endpgm 196; 197; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: 198; GFX10-WGP: ; %bb.0: ; %entry 199; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 200; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 201; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 202; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 203; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 204; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc 205; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 206; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 207; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 208; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 209; GFX10-WGP-NEXT: s_endpgm 210; 211; GFX10-CU-LABEL: flat_workgroup_monotonic_load: 212; GFX10-CU: ; %bb.0: ; %entry 213; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 214; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 215; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 216; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 217; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 218; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 219; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 220; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 221; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 222; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 223; GFX10-CU-NEXT: s_endpgm 224; 225; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load: 226; SKIP-CACHE-INV: ; %bb.0: ; %entry 227; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 228; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 229; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 230; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 231; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 232; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 234; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 235; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 236; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 237; SKIP-CACHE-INV-NEXT: s_endpgm 238; 239; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: 240; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 241; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 242; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 243; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 244; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 245; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 246; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 247; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 248; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 249; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 250; 251; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: 252; GFX90A-TGSPLIT: ; %bb.0: ; %entry 253; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 254; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x8 255; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 256; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 257; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc 258; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 259; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 260; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 261; GFX90A-TGSPLIT-NEXT: s_endpgm 262; 263; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: 264; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 265; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 266; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 267; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 268; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 269; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 270; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 271; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 272; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 273; GFX940-NOTTGSPLIT-NEXT: s_endpgm 274; 275; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load: 276; GFX940-TGSPLIT: ; %bb.0: ; %entry 277; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 278; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 279; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 280; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 281; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 282; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 283; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 284; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 285; GFX940-TGSPLIT-NEXT: s_endpgm 286; 287; GFX11-WGP-LABEL: flat_workgroup_monotonic_load: 288; GFX11-WGP: ; %bb.0: ; %entry 289; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 290; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 291; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 292; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 293; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 294; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc 295; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 296; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 297; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 298; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 299; GFX11-WGP-NEXT: s_endpgm 300; 301; GFX11-CU-LABEL: flat_workgroup_monotonic_load: 302; GFX11-CU: ; %bb.0: ; %entry 303; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 304; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 305; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 306; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 307; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 308; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 309; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 310; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 311; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 312; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 313; GFX11-CU-NEXT: s_endpgm 314; 315; GFX12-WGP-LABEL: flat_workgroup_monotonic_load: 316; GFX12-WGP: ; %bb.0: ; %entry 317; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 318; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 319; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 320; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 321; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 322; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE 323; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 324; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 325; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 326; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 327; GFX12-WGP-NEXT: s_endpgm 328; 329; GFX12-CU-LABEL: flat_workgroup_monotonic_load: 330; GFX12-CU: ; %bb.0: ; %entry 331; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 332; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 333; GFX12-CU-NEXT: s_wait_kmcnt 0x0 334; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 335; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 336; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 337; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 338; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 339; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 340; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 341; GFX12-CU-NEXT: s_endpgm 342 ptr %in, ptr %out) { 343entry: 344 %val = load atomic i32, ptr %in 
syncscope("workgroup") monotonic, align 4 345 store i32 %val, ptr %out 346 ret void 347} 348 349define amdgpu_kernel void @flat_workgroup_acquire_load( 350; GFX7-LABEL: flat_workgroup_acquire_load: 351; GFX7: ; %bb.0: ; %entry 352; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 353; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 354; GFX7-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7-NEXT: v_mov_b32_e32 v0, s6 356; GFX7-NEXT: v_mov_b32_e32 v1, s7 357; GFX7-NEXT: flat_load_dword v2, v[0:1] 358; GFX7-NEXT: s_waitcnt lgkmcnt(0) 359; GFX7-NEXT: v_mov_b32_e32 v0, s4 360; GFX7-NEXT: v_mov_b32_e32 v1, s5 361; GFX7-NEXT: s_waitcnt vmcnt(0) 362; GFX7-NEXT: flat_store_dword v[0:1], v2 363; GFX7-NEXT: s_endpgm 364; 365; GFX10-WGP-LABEL: flat_workgroup_acquire_load: 366; GFX10-WGP: ; %bb.0: ; %entry 367; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 368; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 369; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 370; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 371; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 372; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc 373; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 374; GFX10-WGP-NEXT: buffer_gl0_inv 375; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 376; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 377; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 378; GFX10-WGP-NEXT: s_endpgm 379; 380; GFX10-CU-LABEL: flat_workgroup_acquire_load: 381; GFX10-CU: ; %bb.0: ; %entry 382; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 383; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 384; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 385; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 386; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 387; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 388; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 389; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 390; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 391; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 392; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 393; GFX10-CU-NEXT: s_endpgm 394; 395; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load: 
396; SKIP-CACHE-INV: ; %bb.0: ; %entry 397; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 398; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 399; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 400; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 401; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 402; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 403; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 404; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 405; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 406; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 407; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 408; SKIP-CACHE-INV-NEXT: s_endpgm 409; 410; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: 411; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 412; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 413; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 414; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 415; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 416; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 417; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 418; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 419; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 420; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 421; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 422; 423; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: 424; GFX90A-TGSPLIT: ; %bb.0: ; %entry 425; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 426; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 427; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 428; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 429; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc 430; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 431; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 432; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 433; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 434; GFX90A-TGSPLIT-NEXT: s_endpgm 435; 436; 
GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: 437; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 438; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 439; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 440; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 441; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 442; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 443; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 444; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 445; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 446; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 447; GFX940-NOTTGSPLIT-NEXT: s_endpgm 448; 449; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load: 450; GFX940-TGSPLIT: ; %bb.0: ; %entry 451; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 452; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 453; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 454; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 455; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 456; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 457; GFX940-TGSPLIT-NEXT: buffer_inv sc0 458; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 459; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 460; GFX940-TGSPLIT-NEXT: s_endpgm 461; 462; GFX11-WGP-LABEL: flat_workgroup_acquire_load: 463; GFX11-WGP: ; %bb.0: ; %entry 464; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 465; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 466; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 467; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 468; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 469; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc 470; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 471; GFX11-WGP-NEXT: buffer_gl0_inv 472; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 473; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 474; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 475; GFX11-WGP-NEXT: s_endpgm 476; 477; GFX11-CU-LABEL: flat_workgroup_acquire_load: 478; GFX11-CU: ; %bb.0: ; %entry 479; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[4:5], 0x0 480; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 481; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 482; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 483; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 484; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 485; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 486; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 487; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 488; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 489; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 490; GFX11-CU-NEXT: s_endpgm 491; 492; GFX12-WGP-LABEL: flat_workgroup_acquire_load: 493; GFX12-WGP: ; %bb.0: ; %entry 494; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 495; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 496; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 497; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 498; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 499; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE 500; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 501; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 502; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 503; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 504; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 505; GFX12-WGP-NEXT: s_endpgm 506; 507; GFX12-CU-LABEL: flat_workgroup_acquire_load: 508; GFX12-CU: ; %bb.0: ; %entry 509; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 510; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 511; GFX12-CU-NEXT: s_wait_kmcnt 0x0 512; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 513; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 514; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 515; GFX12-CU-NEXT: s_wait_dscnt 0x0 516; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 517; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 518; GFX12-CU-NEXT: s_wait_loadcnt 0x0 519; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 520; GFX12-CU-NEXT: s_endpgm 521 ptr %in, ptr %out) { 522entry: 523 %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4 524 store i32 %val, ptr %out 525 ret void 526} 527 528define amdgpu_kernel void @flat_workgroup_seq_cst_load( 529; GFX7-LABEL: flat_workgroup_seq_cst_load: 530; GFX7: ; %bb.0: ; 
%entry 531; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 532; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 533; GFX7-NEXT: s_waitcnt lgkmcnt(0) 534; GFX7-NEXT: v_mov_b32_e32 v0, s6 535; GFX7-NEXT: v_mov_b32_e32 v1, s7 536; GFX7-NEXT: s_waitcnt lgkmcnt(0) 537; GFX7-NEXT: flat_load_dword v2, v[0:1] 538; GFX7-NEXT: s_waitcnt lgkmcnt(0) 539; GFX7-NEXT: v_mov_b32_e32 v0, s4 540; GFX7-NEXT: v_mov_b32_e32 v1, s5 541; GFX7-NEXT: s_waitcnt vmcnt(0) 542; GFX7-NEXT: flat_store_dword v[0:1], v2 543; GFX7-NEXT: s_endpgm 544; 545; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: 546; GFX10-WGP: ; %bb.0: ; %entry 547; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 548; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 549; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 550; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 551; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 552; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 553; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 554; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc 555; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 556; GFX10-WGP-NEXT: buffer_gl0_inv 557; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 558; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 559; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 560; GFX10-WGP-NEXT: s_endpgm 561; 562; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: 563; GFX10-CU: ; %bb.0: ; %entry 564; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 565; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 566; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 567; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 568; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 569; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 570; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 571; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 572; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 573; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 574; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 575; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 576; GFX10-CU-NEXT: s_endpgm 577; 578; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load: 579; SKIP-CACHE-INV: ; %bb.0: ; %entry 580; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 581; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 582; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 583; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 584; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 585; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 586; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 587; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 588; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 589; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 590; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 591; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 592; SKIP-CACHE-INV-NEXT: s_endpgm 593; 594; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: 595; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 596; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 597; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 598; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 599; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 600; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 601; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 602; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 603; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 604; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 605; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 606; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 607; 608; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: 609; GFX90A-TGSPLIT: ; %bb.0: ; %entry 610; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 611; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 612; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 613; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 614; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 615; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc 616; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 617; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 618; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 619; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 620; GFX90A-TGSPLIT-NEXT: s_endpgm 621; 622; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: 623; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 624; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 625; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 626; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 627; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 628; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 629; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 630; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 631; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 632; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 633; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 634; GFX940-NOTTGSPLIT-NEXT: s_endpgm 635; 636; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: 637; GFX940-TGSPLIT: ; %bb.0: ; %entry 638; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 639; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 640; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 641; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 642; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 643; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 644; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 645; GFX940-TGSPLIT-NEXT: buffer_inv sc0 646; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 647; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 648; GFX940-TGSPLIT-NEXT: s_endpgm 649; 650; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load: 651; GFX11-WGP: ; %bb.0: ; %entry 652; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 653; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 654; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 655; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 656; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 657; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 658; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 659; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc 660; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 661; 
GFX11-WGP-NEXT: buffer_gl0_inv 662; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 663; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 664; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 665; GFX11-WGP-NEXT: s_endpgm 666; 667; GFX11-CU-LABEL: flat_workgroup_seq_cst_load: 668; GFX11-CU: ; %bb.0: ; %entry 669; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 670; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 671; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 672; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 673; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 674; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 675; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 676; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 677; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 678; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 679; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 680; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 681; GFX11-CU-NEXT: s_endpgm 682; 683; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load: 684; GFX12-WGP: ; %bb.0: ; %entry 685; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 686; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 687; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 688; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 689; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 690; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 691; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 692; GFX12-WGP-NEXT: s_wait_storecnt 0x0 693; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 694; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE 695; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 696; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 697; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 698; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 699; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 700; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 701; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 702; GFX12-WGP-NEXT: s_endpgm 703; 704; GFX12-CU-LABEL: flat_workgroup_seq_cst_load: 705; GFX12-CU: ; %bb.0: ; %entry 706; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 707; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 708; GFX12-CU-NEXT: s_wait_kmcnt 0x0 709; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 710; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s3 711; GFX12-CU-NEXT: s_wait_dscnt 0x0 712; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 713; GFX12-CU-NEXT: s_wait_dscnt 0x0 714; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 715; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 716; GFX12-CU-NEXT: s_wait_loadcnt 0x0 717; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 718; GFX12-CU-NEXT: s_endpgm 719 ptr %in, ptr %out) { 720entry: 721 %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 722 store i32 %val, ptr %out 723 ret void 724} 725 726define amdgpu_kernel void @flat_workgroup_unordered_store( 727; GFX7-LABEL: flat_workgroup_unordered_store: 728; GFX7: ; %bb.0: ; %entry 729; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 730; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 731; GFX7-NEXT: s_waitcnt lgkmcnt(0) 732; GFX7-NEXT: v_mov_b32_e32 v0, s6 733; GFX7-NEXT: v_mov_b32_e32 v1, s7 734; GFX7-NEXT: v_mov_b32_e32 v2, s4 735; GFX7-NEXT: flat_store_dword v[0:1], v2 736; GFX7-NEXT: s_endpgm 737; 738; GFX10-WGP-LABEL: flat_workgroup_unordered_store: 739; GFX10-WGP: ; %bb.0: ; %entry 740; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 741; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 742; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 743; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 744; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 745; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 746; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 747; GFX10-WGP-NEXT: s_endpgm 748; 749; GFX10-CU-LABEL: flat_workgroup_unordered_store: 750; GFX10-CU: ; %bb.0: ; %entry 751; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 752; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 753; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 754; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 755; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 756; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 757; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 758; GFX10-CU-NEXT: s_endpgm 759; 760; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store: 761; SKIP-CACHE-INV: ; %bb.0: ; %entry 762; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 763; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 764; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 765; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 766; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 767; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 768; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 769; SKIP-CACHE-INV-NEXT: s_endpgm 770; 771; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: 772; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 773; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 774; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 775; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 776; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 777; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 778; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 779; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 780; 781; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: 782; GFX90A-TGSPLIT: ; %bb.0: ; %entry 783; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 784; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 785; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 786; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 787; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 788; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 789; GFX90A-TGSPLIT-NEXT: s_endpgm 790; 791; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: 792; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 793; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 794; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 795; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 796; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 797; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 798; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 799; GFX940-NOTTGSPLIT-NEXT: s_endpgm 800; 801; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store: 802; GFX940-TGSPLIT: ; %bb.0: ; %entry 803; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 804; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 805; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 806; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 807; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 808; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 809; GFX940-TGSPLIT-NEXT: s_endpgm 810; 811; GFX11-WGP-LABEL: flat_workgroup_unordered_store: 812; GFX11-WGP: ; %bb.0: ; %entry 813; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 814; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 815; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 816; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 817; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 818; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 819; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 820; GFX11-WGP-NEXT: s_endpgm 821; 822; GFX11-CU-LABEL: flat_workgroup_unordered_store: 823; GFX11-CU: ; %bb.0: ; %entry 824; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 825; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 826; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 827; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 828; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 829; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 830; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 831; GFX11-CU-NEXT: s_endpgm 832; 833; GFX12-WGP-LABEL: flat_workgroup_unordered_store: 834; GFX12-WGP: ; %bb.0: ; %entry 835; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 836; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 837; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 838; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 839; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 840; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 841; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 842; GFX12-WGP-NEXT: s_endpgm 843; 844; GFX12-CU-LABEL: flat_workgroup_unordered_store: 845; GFX12-CU: ; %bb.0: ; %entry 846; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 847; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 848; GFX12-CU-NEXT: s_wait_kmcnt 0x0 849; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 850; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 851; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 852; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 853; GFX12-CU-NEXT: 
s_endpgm 854 i32 %in, ptr %out) { 855entry: 856 store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 857 ret void 858} 859
; Kernel under test: atomically stores the i32 argument %in through the pointer argument %out with syncscope("workgroup") and `monotonic` ordering (align 4), then returns. The per-prefix CHECK lines placed inside the signature are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing them by hand.
860define amdgpu_kernel void @flat_workgroup_monotonic_store( 861; GFX7-LABEL: flat_workgroup_monotonic_store: 862; GFX7: ; %bb.0: ; %entry 863; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 864; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 865; GFX7-NEXT: s_waitcnt lgkmcnt(0) 866; GFX7-NEXT: v_mov_b32_e32 v0, s6 867; GFX7-NEXT: v_mov_b32_e32 v1, s7 868; GFX7-NEXT: v_mov_b32_e32 v2, s4 869; GFX7-NEXT: flat_store_dword v[0:1], v2 870; GFX7-NEXT: s_endpgm 871; 872; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: 873; GFX10-WGP: ; %bb.0: ; %entry 874; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 875; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 876; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 877; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 878; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 879; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 880; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 881; GFX10-WGP-NEXT: s_endpgm 882; 883; GFX10-CU-LABEL: flat_workgroup_monotonic_store: 884; GFX10-CU: ; %bb.0: ; %entry 885; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 886; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 887; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 888; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 889; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 890; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 891; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 892; GFX10-CU-NEXT: s_endpgm 893; 894; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store: 895; SKIP-CACHE-INV: ; %bb.0: ; %entry 896; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 897; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 898; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 899; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 900; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 901; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 902; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 903; SKIP-CACHE-INV-NEXT: s_endpgm 904; 905;
GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: 906; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 907; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 908; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 909; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 910; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 911; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 912; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 913; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 914; 915; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: 916; GFX90A-TGSPLIT: ; %bb.0: ; %entry 917; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 918; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 919; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 920; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 921; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 922; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 923; GFX90A-TGSPLIT-NEXT: s_endpgm 924; 925; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: 926; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 927; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 928; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 929; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 930; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 931; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 932; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 933; GFX940-NOTTGSPLIT-NEXT: s_endpgm 934; 935; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store: 936; GFX940-TGSPLIT: ; %bb.0: ; %entry 937; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 938; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 939; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 940; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 941; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 942; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 943; GFX940-TGSPLIT-NEXT: s_endpgm 944; 945; GFX11-WGP-LABEL: flat_workgroup_monotonic_store: 946; 
GFX11-WGP: ; %bb.0: ; %entry 947; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 948; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 949; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 950; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 951; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 952; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 953; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 954; GFX11-WGP-NEXT: s_endpgm 955; 956; GFX11-CU-LABEL: flat_workgroup_monotonic_store: 957; GFX11-CU: ; %bb.0: ; %entry 958; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 959; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 960; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 961; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 962; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 963; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 964; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 965; GFX11-CU-NEXT: s_endpgm 966; 967; GFX12-WGP-LABEL: flat_workgroup_monotonic_store: 968; GFX12-WGP: ; %bb.0: ; %entry 969; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 970; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 971; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 972; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 973; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 974; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 975; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE 976; GFX12-WGP-NEXT: s_endpgm 977; 978; GFX12-CU-LABEL: flat_workgroup_monotonic_store: 979; GFX12-CU: ; %bb.0: ; %entry 980; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 981; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 982; GFX12-CU-NEXT: s_wait_kmcnt 0x0 983; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 984; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 985; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 986; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 987; GFX12-CU-NEXT: s_endpgm 988 i32 %in, ptr %out) { 989entry: 990 store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 991 ret void 992} 993
; Kernel under test: atomically stores the i32 argument %in through the pointer argument %out with syncscope("workgroup") and `release` ordering (align 4), then returns. The per-prefix CHECK lines placed inside the signature are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing them by hand.
994define amdgpu_kernel void @flat_workgroup_release_store( 995; GFX7-LABEL: flat_workgroup_release_store: 996; GFX7: ; %bb.0: ; %entry 997; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 998;
GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 999; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1000; GFX7-NEXT: v_mov_b32_e32 v0, s6 1001; GFX7-NEXT: v_mov_b32_e32 v1, s7 1002; GFX7-NEXT: v_mov_b32_e32 v2, s4 1003; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1004; GFX7-NEXT: flat_store_dword v[0:1], v2 1005; GFX7-NEXT: s_endpgm 1006; 1007; GFX10-WGP-LABEL: flat_workgroup_release_store: 1008; GFX10-WGP: ; %bb.0: ; %entry 1009; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 1010; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1011; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1012; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1013; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1014; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1015; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1016; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1017; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1018; GFX10-WGP-NEXT: s_endpgm 1019; 1020; GFX10-CU-LABEL: flat_workgroup_release_store: 1021; GFX10-CU: ; %bb.0: ; %entry 1022; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 1023; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1024; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1026; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1027; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1028; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1029; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1030; GFX10-CU-NEXT: s_endpgm 1031; 1032; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store: 1033; SKIP-CACHE-INV: ; %bb.0: ; %entry 1034; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 1035; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1036; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1037; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1038; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1039; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1040; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1041; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1042; SKIP-CACHE-INV-NEXT: s_endpgm 1043; 1044; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: 1045; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry 1046; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1047; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1048; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1049; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1050; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1051; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1053; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1054; 1055; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: 1056; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1057; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1058; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1059; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1060; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1061; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1062; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1063; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1064; GFX90A-TGSPLIT-NEXT: s_endpgm 1065; 1066; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store: 1067; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1068; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1069; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1070; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1072; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1073; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1074; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1075; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1076; 1077; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store: 1078; GFX940-TGSPLIT: ; %bb.0: ; %entry 1079; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1080; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1081; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1083; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1084; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1085; 
GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1086; GFX940-TGSPLIT-NEXT: s_endpgm 1087; 1088; GFX11-WGP-LABEL: flat_workgroup_release_store: 1089; GFX11-WGP: ; %bb.0: ; %entry 1090; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1091; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1092; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1093; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1094; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1095; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1096; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1097; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1098; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1099; GFX11-WGP-NEXT: s_endpgm 1100; 1101; GFX11-CU-LABEL: flat_workgroup_release_store: 1102; GFX11-CU: ; %bb.0: ; %entry 1103; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1104; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1105; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1107; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1108; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1109; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1111; GFX11-CU-NEXT: s_endpgm 1112; 1113; GFX12-WGP-LABEL: flat_workgroup_release_store: 1114; GFX12-WGP: ; %bb.0: ; %entry 1115; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1116; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1117; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1118; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1119; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1120; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1121; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 1122; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 1123; GFX12-WGP-NEXT: s_wait_storecnt 0x0 1124; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 1125; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE 1126; GFX12-WGP-NEXT: s_endpgm 1127; 1128; GFX12-CU-LABEL: flat_workgroup_release_store: 1129; GFX12-CU: ; %bb.0: ; %entry 1130; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1131; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1132; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1133; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s2 1134; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1135; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1136; GFX12-CU-NEXT: s_wait_dscnt 0x0 1137; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 1138; GFX12-CU-NEXT: s_endpgm 1139 i32 %in, ptr %out) { 1140entry: 1141 store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 1142 ret void 1143} 1144
; Kernel under test: atomically stores the i32 argument %in through the pointer argument %out with syncscope("workgroup") and `seq_cst` ordering (align 4), then returns. The per-prefix CHECK lines placed inside the signature are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing them by hand.
1145define amdgpu_kernel void @flat_workgroup_seq_cst_store( 1146; GFX7-LABEL: flat_workgroup_seq_cst_store: 1147; GFX7: ; %bb.0: ; %entry 1148; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 1149; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 1150; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1151; GFX7-NEXT: v_mov_b32_e32 v0, s6 1152; GFX7-NEXT: v_mov_b32_e32 v1, s7 1153; GFX7-NEXT: v_mov_b32_e32 v2, s4 1154; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX7-NEXT: flat_store_dword v[0:1], v2 1156; GFX7-NEXT: s_endpgm 1157; 1158; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: 1159; GFX10-WGP: ; %bb.0: ; %entry 1160; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 1161; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1162; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1163; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1164; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1165; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1166; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1167; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1168; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1169; GFX10-WGP-NEXT: s_endpgm 1170; 1171; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: 1172; GFX10-CU: ; %bb.0: ; %entry 1173; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 1174; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1175; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1176; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1177; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1178; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1179; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1181; GFX10-CU-NEXT: s_endpgm 1182; 1183; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store: 1184; SKIP-CACHE-INV: ; %bb.0: ;
%entry 1185; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 1186; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1187; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1188; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1189; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1190; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1191; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1192; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1193; SKIP-CACHE-INV-NEXT: s_endpgm 1194; 1195; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: 1196; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1197; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1198; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1199; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1201; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1202; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1204; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1205; 1206; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: 1207; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1208; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1209; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1210; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1211; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1212; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1213; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1214; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1215; GFX90A-TGSPLIT-NEXT: s_endpgm 1216; 1217; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: 1218; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1219; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1220; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1221; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1223; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1224; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 1225; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1226; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1227; 1228; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: 1229; GFX940-TGSPLIT: ; %bb.0: ; %entry 1230; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1231; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1232; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1233; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1234; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1235; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1236; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1237; GFX940-TGSPLIT-NEXT: s_endpgm 1238; 1239; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store: 1240; GFX11-WGP: ; %bb.0: ; %entry 1241; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1242; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1243; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1244; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1245; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1246; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1247; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1248; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1249; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1250; GFX11-WGP-NEXT: s_endpgm 1251; 1252; GFX11-CU-LABEL: flat_workgroup_seq_cst_store: 1253; GFX11-CU: ; %bb.0: ; %entry 1254; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1255; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1256; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1257; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1258; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1259; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1260; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1262; GFX11-CU-NEXT: s_endpgm 1263; 1264; GFX12-WGP-LABEL: flat_workgroup_seq_cst_store: 1265; GFX12-WGP: ; %bb.0: ; %entry 1266; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1267; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1268; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1269; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1270; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1271; 
GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1272; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 1273; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 1274; GFX12-WGP-NEXT: s_wait_storecnt 0x0 1275; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 1276; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE 1277; GFX12-WGP-NEXT: s_endpgm 1278; 1279; GFX12-CU-LABEL: flat_workgroup_seq_cst_store: 1280; GFX12-CU: ; %bb.0: ; %entry 1281; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1282; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1283; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1284; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1285; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1286; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1287; GFX12-CU-NEXT: s_wait_dscnt 0x0 1288; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 1289; GFX12-CU-NEXT: s_endpgm 1290 i32 %in, ptr %out) { 1291entry: 1292 store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 1293 ret void 1294} 1295
; Kernel under test: performs a volatile `atomicrmw xchg` that exchanges the i32 argument %in into the location addressed by %out, with syncscope("workgroup") and `monotonic` ordering; the result %val is not used before `ret void`. The per-prefix CHECK lines placed inside the signature are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing them by hand.
1296define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( 1297; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: 1298; GFX7: ; %bb.0: ; %entry 1299; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1300; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1301; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1302; GFX7-NEXT: v_mov_b32_e32 v0, s6 1303; GFX7-NEXT: v_mov_b32_e32 v1, s7 1304; GFX7-NEXT: v_mov_b32_e32 v2, s4 1305; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1306; GFX7-NEXT: s_endpgm 1307; 1308; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: 1309; GFX10-WGP: ; %bb.0: ; %entry 1310; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1311; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1312; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1313; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1314; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1315; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1316; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1317; GFX10-WGP-NEXT: s_endpgm 1318; 1319; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: 1320; GFX10-CU: ; %bb.0: ; %entry 1321; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1322; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1323; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1324; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1325; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1326; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1327; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1328; GFX10-CU-NEXT: s_endpgm 1329; 1330; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw: 1331; SKIP-CACHE-INV: ; %bb.0: ; %entry 1332; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1333; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1334; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1335; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1336; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1337; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1338; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1339; SKIP-CACHE-INV-NEXT: s_endpgm 1340; 1341; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: 1342; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1343; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1344; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1345; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1346; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1347; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1348; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1349; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1350; 1351; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: 1352; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1353; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1354; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1355; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1356; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1357; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1358; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1359; GFX90A-TGSPLIT-NEXT: s_endpgm 1360; 1361; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: 1362; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1363; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 
1364; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1365; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1366; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1367; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1368; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1369; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1370; 1371; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: 1372; GFX940-TGSPLIT: ; %bb.0: ; %entry 1373; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1374; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1375; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1377; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1378; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1379; GFX940-TGSPLIT-NEXT: s_endpgm 1380; 1381; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: 1382; GFX11-WGP: ; %bb.0: ; %entry 1383; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1384; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1385; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1386; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1387; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1388; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1389; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1390; GFX11-WGP-NEXT: s_endpgm 1391; 1392; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw: 1393; GFX11-CU: ; %bb.0: ; %entry 1394; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1395; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1396; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1398; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1399; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1400; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1401; GFX11-CU-NEXT: s_endpgm 1402; 1403; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: 1404; GFX12-WGP: ; %bb.0: ; %entry 1405; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1406; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1407; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1408; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1409; 
GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1410; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1411; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 1412; GFX12-WGP-NEXT: s_endpgm 1413; 1414; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: 1415; GFX12-CU: ; %bb.0: ; %entry 1416; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1417; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1418; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1419; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1420; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1421; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1422; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1423; GFX12-CU-NEXT: s_endpgm 1424 ptr %out, i32 %in) { 1425entry: 1426 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic 1427 ret void 1428} 1429
; Kernel under test: performs a volatile `atomicrmw xchg` that exchanges the i32 argument %in into the location addressed by %out, with syncscope("workgroup") and `acquire` ordering; the result %val is not used before `ret void`. The per-prefix CHECK lines placed inside the signature are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing them by hand.
1430define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( 1431; GFX7-LABEL: flat_workgroup_acquire_atomicrmw: 1432; GFX7: ; %bb.0: ; %entry 1433; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1434; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1435; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1436; GFX7-NEXT: v_mov_b32_e32 v0, s6 1437; GFX7-NEXT: v_mov_b32_e32 v1, s7 1438; GFX7-NEXT: v_mov_b32_e32 v2, s4 1439; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1440; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX7-NEXT: s_endpgm 1442; 1443; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: 1444; GFX10-WGP: ; %bb.0: ; %entry 1445; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1446; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1447; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1448; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1449; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1450; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1451; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1452; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1454; GFX10-WGP-NEXT: buffer_gl0_inv 1455; GFX10-WGP-NEXT: s_endpgm 1456; 1457; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: 1458; GFX10-CU: ; %bb.0: ; %entry 1459; GFX10-CU-NEXT: s_load_dwordx2
s[6:7], s[8:9], 0x0 1460; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1461; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1462; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1463; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1464; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1465; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1466; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1467; GFX10-CU-NEXT: s_endpgm 1468; 1469; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw: 1470; SKIP-CACHE-INV: ; %bb.0: ; %entry 1471; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1472; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1473; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1474; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1475; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1476; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1477; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1478; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1479; SKIP-CACHE-INV-NEXT: s_endpgm 1480; 1481; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: 1482; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1483; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1484; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1485; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1486; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1487; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1488; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1489; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1491; 1492; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: 1493; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1494; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1495; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1496; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1497; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1498; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1499; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1500; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1501; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1502; GFX90A-TGSPLIT-NEXT: s_endpgm 1503; 1504; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: 1505; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1506; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1507; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1508; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1510; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1511; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1512; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1513; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1514; 1515; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: 1516; GFX940-TGSPLIT: ; %bb.0: ; %entry 1517; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1518; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1519; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1520; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1521; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1522; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1523; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1524; GFX940-TGSPLIT-NEXT: buffer_inv sc0 1525; GFX940-TGSPLIT-NEXT: s_endpgm 1526; 1527; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw: 1528; GFX11-WGP: ; %bb.0: ; %entry 1529; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1530; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1531; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1532; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1533; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1534; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1535; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1536; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1537; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1538; GFX11-WGP-NEXT: buffer_gl0_inv 1539; GFX11-WGP-NEXT: s_endpgm 1540; 1541; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw: 1542; GFX11-CU: ; %bb.0: ; %entry 1543; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1544; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1545; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) 1546; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1547; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1548; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1549; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1550; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1551; GFX11-CU-NEXT: s_endpgm 1552; 1553; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw: 1554; GFX12-WGP: ; %bb.0: ; %entry 1555; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1556; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1557; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1558; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1559; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1560; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1561; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 1562; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 1563; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 1564; GFX12-WGP-NEXT: s_endpgm 1565; 1566; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw: 1567; GFX12-CU: ; %bb.0: ; %entry 1568; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1569; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1570; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1571; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1572; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1573; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1574; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1575; GFX12-CU-NEXT: s_wait_dscnt 0x0 1576; GFX12-CU-NEXT: s_endpgm 1577 ptr %out, i32 %in) { 1578entry: 1579 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire 1580 ret void 1581} 1582 ; Test case -- workgroup-scope release atomicrmw xchg through a generic ptr, result unused. In the expectations that follow, every configuration issues its wait(s) before the flat atomic swap and emits nothing between the swap and s_endpgm. The expectations are autogenerated; regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 1583define amdgpu_kernel void @flat_workgroup_release_atomicrmw( 1584; GFX7-LABEL: flat_workgroup_release_atomicrmw: 1585; GFX7: ; %bb.0: ; %entry 1586; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1587; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1588; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1589; GFX7-NEXT: v_mov_b32_e32 v0, s6 1590; GFX7-NEXT: v_mov_b32_e32 v1, s7 1591; GFX7-NEXT: v_mov_b32_e32 v2, s4 1592; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1593; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1594; GFX7-NEXT: s_endpgm 1595; 1596; GFX10-WGP-LABEL: 
flat_workgroup_release_atomicrmw: 1597; GFX10-WGP: ; %bb.0: ; %entry 1598; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1599; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1600; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1602; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1603; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1604; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1605; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1606; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1607; GFX10-WGP-NEXT: s_endpgm 1608; 1609; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: 1610; GFX10-CU: ; %bb.0: ; %entry 1611; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1612; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1613; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1615; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1616; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1617; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1618; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1619; GFX10-CU-NEXT: s_endpgm 1620; 1621; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw: 1622; SKIP-CACHE-INV: ; %bb.0: ; %entry 1623; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1624; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1625; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1626; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1627; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1628; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1629; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1630; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1631; SKIP-CACHE-INV-NEXT: s_endpgm 1632; 1633; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: 1634; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1635; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1636; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1637; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1638; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1639; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 
1640; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1641; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1642; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1643; 1644; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: 1645; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1646; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1647; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1648; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1649; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1650; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1651; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1652; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1653; GFX90A-TGSPLIT-NEXT: s_endpgm 1654; 1655; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: 1656; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1657; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1658; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1659; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1660; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1661; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1662; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1663; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1664; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1665; 1666; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: 1667; GFX940-TGSPLIT: ; %bb.0: ; %entry 1668; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1669; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1670; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1671; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1672; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1673; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1674; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1675; GFX940-TGSPLIT-NEXT: s_endpgm 1676; 1677; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw: 1678; GFX11-WGP: ; %bb.0: ; %entry 1679; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1680; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1681; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 1682; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1683; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1684; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1685; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1686; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1687; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1688; GFX11-WGP-NEXT: s_endpgm 1689; 1690; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw: 1691; GFX11-CU: ; %bb.0: ; %entry 1692; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1693; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1694; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1695; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1696; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1697; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1698; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1700; GFX11-CU-NEXT: s_endpgm 1701; 1702; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw: 1703; GFX12-WGP: ; %bb.0: ; %entry 1704; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1705; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1706; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1707; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1708; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1709; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1710; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 1711; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 1712; GFX12-WGP-NEXT: s_wait_storecnt 0x0 1713; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 1714; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 1715; GFX12-WGP-NEXT: s_endpgm 1716; 1717; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: 1718; GFX12-CU: ; %bb.0: ; %entry 1719; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1720; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1721; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1722; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1723; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1724; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1725; GFX12-CU-NEXT: s_wait_dscnt 0x0 1726; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1727; GFX12-CU-NEXT: s_endpgm 1728 ptr %out, i32 %in) { 1729entry: 1730 %val = atomicrmw 
volatile xchg ptr %out, i32 %in syncscope("workgroup") release 1731 ret void 1732} 1733 ; Test case -- workgroup-scope acq_rel atomicrmw xchg through a generic ptr, result unused. In the expectations that follow, every configuration has waits on both sides of the flat atomic swap, and some additionally emit buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or global_inv scope:SCOPE_SE after it. The expectations are autogenerated; regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 1734define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( 1735; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: 1736; GFX7: ; %bb.0: ; %entry 1737; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1738; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1739; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX7-NEXT: v_mov_b32_e32 v0, s6 1741; GFX7-NEXT: v_mov_b32_e32 v1, s7 1742; GFX7-NEXT: v_mov_b32_e32 v2, s4 1743; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1744; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1745; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1746; GFX7-NEXT: s_endpgm 1747; 1748; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: 1749; GFX10-WGP: ; %bb.0: ; %entry 1750; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1751; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1752; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1753; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1754; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1755; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1756; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1757; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1758; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1759; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1760; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1761; GFX10-WGP-NEXT: buffer_gl0_inv 1762; GFX10-WGP-NEXT: s_endpgm 1763; 1764; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: 1765; GFX10-CU: ; %bb.0: ; %entry 1766; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1767; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1768; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1769; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1770; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1771; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1772; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1773; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1774; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1775; GFX10-CU-NEXT: s_endpgm 1776; 1777; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw: 1778; SKIP-CACHE-INV: ; %bb.0: ; %entry 1779; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 1780; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1781; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1783; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1784; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1785; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1786; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1787; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1788; SKIP-CACHE-INV-NEXT: s_endpgm 1789; 1790; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: 1791; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1792; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1793; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1794; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1795; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1796; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1797; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1798; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1799; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1800; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1801; 1802; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: 1803; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1804; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1805; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1806; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1807; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1808; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1809; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1810; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1811; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1812; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1813; GFX90A-TGSPLIT-NEXT: s_endpgm 1814; 1815; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: 1816; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1817; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1818; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1819; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 1820; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1821; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1822; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1823; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1824; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1825; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1826; 1827; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: 1828; GFX940-TGSPLIT: ; %bb.0: ; %entry 1829; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1830; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1831; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1832; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1833; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1834; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1835; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1836; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1837; GFX940-TGSPLIT-NEXT: buffer_inv sc0 1838; GFX940-TGSPLIT-NEXT: s_endpgm 1839; 1840; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: 1841; GFX11-WGP: ; %bb.0: ; %entry 1842; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1843; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1844; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1845; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1846; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1847; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1848; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1849; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1850; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1851; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1852; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1853; GFX11-WGP-NEXT: buffer_gl0_inv 1854; GFX11-WGP-NEXT: s_endpgm 1855; 1856; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: 1857; GFX11-CU: ; %bb.0: ; %entry 1858; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1859; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1860; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1862; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1863; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1864; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1865; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1866; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1867; GFX11-CU-NEXT: s_endpgm 1868; 1869; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: 1870; GFX12-WGP: ; %bb.0: ; %entry 1871; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1872; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1873; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1874; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1875; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1876; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1877; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 1878; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 1879; GFX12-WGP-NEXT: s_wait_storecnt 0x0 1880; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 1881; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 1882; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 1883; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 1884; GFX12-WGP-NEXT: s_endpgm 1885; 1886; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: 1887; GFX12-CU: ; %bb.0: ; %entry 1888; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1889; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1890; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1891; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1892; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1893; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1894; GFX12-CU-NEXT: s_wait_dscnt 0x0 1895; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1896; GFX12-CU-NEXT: s_wait_dscnt 0x0 1897; GFX12-CU-NEXT: s_endpgm 1898 ptr %out, i32 %in) { 1899entry: 1900 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel 1901 ret void 1902} 1903 ; Test case -- workgroup-scope seq_cst atomicrmw xchg through a generic ptr, result unused. In the expectations that follow, every configuration has waits on both sides of the flat atomic swap, and some additionally emit buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or global_inv scope:SCOPE_SE after it. The expectations are autogenerated; regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 1904define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( 1905; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: 1906; GFX7: ; %bb.0: ; %entry 1907; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1908; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1909; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1910; GFX7-NEXT: v_mov_b32_e32 v0, s6 1911; GFX7-NEXT: v_mov_b32_e32 v1, s7 1912; GFX7-NEXT: v_mov_b32_e32 v2, s4 1913; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
1914; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1915; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1916; GFX7-NEXT: s_endpgm 1917; 1918; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: 1919; GFX10-WGP: ; %bb.0: ; %entry 1920; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1921; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1922; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1923; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1924; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1925; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1926; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1927; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1928; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1929; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1930; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1931; GFX10-WGP-NEXT: buffer_gl0_inv 1932; GFX10-WGP-NEXT: s_endpgm 1933; 1934; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: 1935; GFX10-CU: ; %bb.0: ; %entry 1936; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1937; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1938; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1939; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1940; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1941; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1942; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1943; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1944; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1945; GFX10-CU-NEXT: s_endpgm 1946; 1947; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw: 1948; SKIP-CACHE-INV: ; %bb.0: ; %entry 1949; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1950; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1951; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1952; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1953; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1954; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1955; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1956; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1957; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1958; SKIP-CACHE-INV-NEXT: s_endpgm 1959; 1960; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_seq_cst_atomicrmw: 1961; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1962; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1963; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1964; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1965; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1966; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1967; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1968; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1969; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1970; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1971; 1972; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: 1973; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1974; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1975; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1976; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1977; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1978; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1979; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1980; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1981; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1982; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 1983; GFX90A-TGSPLIT-NEXT: s_endpgm 1984; 1985; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: 1986; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1987; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1988; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1989; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1990; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1991; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1992; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1993; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1994; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1995; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1996; 1997; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: 1998; GFX940-TGSPLIT: ; %bb.0: ; %entry 1999; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2000; 
GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 2001; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2002; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 2003; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 2004; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2005; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 2006; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2007; GFX940-TGSPLIT-NEXT: buffer_inv sc0 2008; GFX940-TGSPLIT-NEXT: s_endpgm 2009; 2010; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: 2011; GFX11-WGP: ; %bb.0: ; %entry 2012; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 2013; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 2014; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2015; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 2016; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 2017; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 2018; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2019; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2020; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 2021; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2022; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2023; GFX11-WGP-NEXT: buffer_gl0_inv 2024; GFX11-WGP-NEXT: s_endpgm 2025; 2026; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: 2027; GFX11-CU: ; %bb.0: ; %entry 2028; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 2029; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 2030; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2031; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 2032; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 2033; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 2034; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2035; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 2036; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX11-CU-NEXT: s_endpgm 2038; 2039; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: 2040; GFX12-WGP: ; %bb.0: ; %entry 2041; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 2042; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 2043; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2044; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 2045; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 2046; 
GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 2047; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 2048; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 2049; GFX12-WGP-NEXT: s_wait_storecnt 0x0 2050; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 2051; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 2052; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 2053; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 2054; GFX12-WGP-NEXT: s_endpgm 2055; 2056; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: 2057; GFX12-CU: ; %bb.0: ; %entry 2058; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 2059; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 2060; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2061; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 2062; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 2063; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 2064; GFX12-CU-NEXT: s_wait_dscnt 0x0 2065; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 2066; GFX12-CU-NEXT: s_wait_dscnt 0x0 2067; GFX12-CU-NEXT: s_endpgm 2068 ptr %out, i32 %in) { 2069entry: 2070 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst 2071 ret void 2072} 2073 ; Test case -- workgroup-scope acquire atomicrmw xchg through a generic ptr whose result is stored back to %out. In the expectations that follow, the flat atomic swap writes a destination register and carries a glc, sc0 or th:TH_ATOMIC_RETURN modifier, and that register is then written out with a flat store. The expectations are autogenerated; regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 2074define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( 2075; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2076; GFX7: ; %bb.0: ; %entry 2077; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2078; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 2079; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2080; GFX7-NEXT: v_mov_b32_e32 v0, s4 2081; GFX7-NEXT: v_mov_b32_e32 v1, s5 2082; GFX7-NEXT: v_mov_b32_e32 v2, s6 2083; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2084; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2085; GFX7-NEXT: v_mov_b32_e32 v0, s4 2086; GFX7-NEXT: v_mov_b32_e32 v1, s5 2087; GFX7-NEXT: s_waitcnt vmcnt(0) 2088; GFX7-NEXT: flat_store_dword v[0:1], v2 2089; GFX7-NEXT: s_endpgm 2090; 2091; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2092; GFX10-WGP: ; %bb.0: ; %entry 2093; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2094; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 2095; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 2096; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2097; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2098; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 2099; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2100; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2101; GFX10-WGP-NEXT: buffer_gl0_inv 2102; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2103; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2104; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2105; GFX10-WGP-NEXT: s_endpgm 2106; 2107; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2108; GFX10-CU: ; %bb.0: ; %entry 2109; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2110; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 2111; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2112; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2113; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2114; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 2115; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2116; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2117; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2118; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2119; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2120; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2121; GFX10-CU-NEXT: s_endpgm 2122; 2123; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2124; SKIP-CACHE-INV: ; %bb.0: ; %entry 2125; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2126; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 2127; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2128; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2129; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2130; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2131; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2132; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2133; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2134; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2135; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2136; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2137; SKIP-CACHE-INV-NEXT: s_endpgm 2138; 2139; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2140; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry 2141; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2142; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2143; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2144; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2145; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2146; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2147; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2149; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 2150; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2151; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2152; 2153; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2154; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2155; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2156; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2157; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2158; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2159; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2160; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2161; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2162; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2163; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2164; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2165; GFX90A-TGSPLIT-NEXT: s_endpgm 2166; 2167; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2168; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2169; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2170; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2171; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2172; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2173; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2174; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2175; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2176; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2177; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) 2178; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2179; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2180; 2181; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2182; GFX940-TGSPLIT: ; %bb.0: ; %entry 2183; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2184; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2185; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2186; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2187; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2188; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2189; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2190; GFX940-TGSPLIT-NEXT: buffer_inv sc0 2191; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2192; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2193; GFX940-TGSPLIT-NEXT: s_endpgm 2194; 2195; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2196; GFX11-WGP: ; %bb.0: ; %entry 2197; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2198; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2199; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2200; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2201; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2202; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 2203; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2204; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2205; GFX11-WGP-NEXT: buffer_gl0_inv 2206; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2207; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2208; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 2209; GFX11-WGP-NEXT: s_endpgm 2210; 2211; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2212; GFX11-CU: ; %bb.0: ; %entry 2213; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2214; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2215; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2216; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2217; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2218; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 2219; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2220; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2221; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 2222; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2223; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 2224; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 2225; GFX11-CU-NEXT: s_endpgm 2226; 2227; GFX12-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2228; GFX12-WGP: ; %bb.0: ; %entry 2229; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2230; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2231; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2232; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2233; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2234; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 2235; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE 2236; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 2237; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 2238; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2239; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2240; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 2241; GFX12-WGP-NEXT: s_endpgm 2242; 2243; GFX12-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: 2244; GFX12-CU: ; %bb.0: ; %entry 2245; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2246; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2247; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2248; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2249; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2250; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 2251; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 2252; GFX12-CU-NEXT: s_wait_dscnt 0x0 2253; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2254; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2255; GFX12-CU-NEXT: s_wait_loadcnt 0x0 2256; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 2257; GFX12-CU-NEXT: s_endpgm 2258 ptr %out, i32 %in) { 2259entry: 2260 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire 2261 store i32 %val, ptr %out, align 4 2262 ret void 2263} 2264 2265define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( 2266; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2267; GFX7: ; %bb.0: ; %entry 2268; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2269; GFX7-NEXT: 
s_load_dword s6, s[8:9], 0x2 2270; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2271; GFX7-NEXT: v_mov_b32_e32 v0, s4 2272; GFX7-NEXT: v_mov_b32_e32 v1, s5 2273; GFX7-NEXT: v_mov_b32_e32 v2, s6 2274; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2275; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2276; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2277; GFX7-NEXT: v_mov_b32_e32 v0, s4 2278; GFX7-NEXT: v_mov_b32_e32 v1, s5 2279; GFX7-NEXT: s_waitcnt vmcnt(0) 2280; GFX7-NEXT: flat_store_dword v[0:1], v2 2281; GFX7-NEXT: s_endpgm 2282; 2283; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2284; GFX10-WGP: ; %bb.0: ; %entry 2285; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2286; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 2287; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2288; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2289; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2290; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 2291; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2292; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2293; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2294; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2295; GFX10-WGP-NEXT: buffer_gl0_inv 2296; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2297; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2298; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2299; GFX10-WGP-NEXT: s_endpgm 2300; 2301; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2302; GFX10-CU: ; %bb.0: ; %entry 2303; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2304; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 2305; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2306; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2307; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2308; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 2309; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2310; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2311; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2312; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2313; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2314; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2315; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2316; GFX10-CU-NEXT: 
s_endpgm 2317; 2318; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2319; SKIP-CACHE-INV: ; %bb.0: ; %entry 2320; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2321; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 2322; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2323; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2324; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2325; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2326; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2327; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2328; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2329; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2330; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2331; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2332; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2333; SKIP-CACHE-INV-NEXT: s_endpgm 2334; 2335; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2336; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2337; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2338; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2339; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2340; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2341; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2342; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2343; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2344; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2345; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2346; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 2347; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2348; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2349; 2350; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2351; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2352; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2353; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2354; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2355; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2356; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2357; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2358; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2359; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2360; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2361; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2362; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2363; GFX90A-TGSPLIT-NEXT: s_endpgm 2364; 2365; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2366; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2367; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2368; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2369; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2370; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2371; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2372; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2373; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2374; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2375; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2376; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 2377; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2378; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2379; 2380; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2381; GFX940-TGSPLIT: ; %bb.0: ; %entry 2382; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2383; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2384; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2385; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2386; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2387; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2388; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2389; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2390; GFX940-TGSPLIT-NEXT: buffer_inv sc0 2391; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2392; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2393; GFX940-TGSPLIT-NEXT: s_endpgm 2394; 2395; GFX11-WGP-LABEL: 
flat_workgroup_acq_rel_ret_atomicrmw: 2396; GFX11-WGP: ; %bb.0: ; %entry 2397; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2398; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2399; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2400; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2401; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2402; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 2403; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2404; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2405; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2406; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2407; GFX11-WGP-NEXT: buffer_gl0_inv 2408; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2409; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2410; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 2411; GFX11-WGP-NEXT: s_endpgm 2412; 2413; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2414; GFX11-CU: ; %bb.0: ; %entry 2415; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2416; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2417; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2418; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2419; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2420; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 2421; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2422; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2423; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2424; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2425; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2426; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 2427; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 2428; GFX11-CU-NEXT: s_endpgm 2429; 2430; GFX12-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2431; GFX12-WGP: ; %bb.0: ; %entry 2432; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2433; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2434; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2435; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2436; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2437; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 2438; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 2439; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 2440; GFX12-WGP-NEXT: s_wait_storecnt 0x0 2441; GFX12-WGP-NEXT: 
s_wait_loadcnt_dscnt 0x0 2442; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE 2443; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 2444; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 2445; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 2446; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 2447; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2448; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2449; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 2450; GFX12-WGP-NEXT: s_endpgm 2451; 2452; GFX12-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: 2453; GFX12-CU: ; %bb.0: ; %entry 2454; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2455; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2456; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2457; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2458; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2459; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 2460; GFX12-CU-NEXT: s_wait_dscnt 0x0 2461; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 2462; GFX12-CU-NEXT: s_wait_dscnt 0x0 2463; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2464; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2465; GFX12-CU-NEXT: s_wait_loadcnt 0x0 2466; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 2467; GFX12-CU-NEXT: s_endpgm 2468 ptr %out, i32 %in) { 2469entry: 2470 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel 2471 store i32 %val, ptr %out, align 4 2472 ret void 2473} 2474 2475define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( 2476; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2477; GFX7: ; %bb.0: ; %entry 2478; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2479; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 2480; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2481; GFX7-NEXT: v_mov_b32_e32 v0, s4 2482; GFX7-NEXT: v_mov_b32_e32 v1, s5 2483; GFX7-NEXT: v_mov_b32_e32 v2, s6 2484; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2485; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2486; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2487; GFX7-NEXT: v_mov_b32_e32 v0, s4 2488; GFX7-NEXT: v_mov_b32_e32 v1, s5 2489; GFX7-NEXT: 
s_waitcnt vmcnt(0) 2490; GFX7-NEXT: flat_store_dword v[0:1], v2 2491; GFX7-NEXT: s_endpgm 2492; 2493; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2494; GFX10-WGP: ; %bb.0: ; %entry 2495; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2496; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 2497; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2499; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2500; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 2501; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2502; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2503; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2504; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2505; GFX10-WGP-NEXT: buffer_gl0_inv 2506; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2507; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2508; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2509; GFX10-WGP-NEXT: s_endpgm 2510; 2511; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2512; GFX10-CU: ; %bb.0: ; %entry 2513; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2514; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 2515; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2516; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2517; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2518; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 2519; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2520; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2521; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2522; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2523; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2524; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2525; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2526; GFX10-CU-NEXT: s_endpgm 2527; 2528; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2529; SKIP-CACHE-INV: ; %bb.0: ; %entry 2530; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2531; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 2532; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2533; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2534; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2535; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2536; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2537; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2538; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2541; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2542; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2543; SKIP-CACHE-INV-NEXT: s_endpgm 2544; 2545; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2546; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2547; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2548; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2549; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2550; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2551; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2552; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2553; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2554; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2555; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2556; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 2557; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2558; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2559; 2560; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2561; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2562; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2563; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2564; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2565; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2566; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2567; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2568; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2569; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2570; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 2571; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2572; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 
2573; GFX90A-TGSPLIT-NEXT: s_endpgm 2574; 2575; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2576; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2577; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2578; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2579; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2580; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2581; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2582; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2583; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2584; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2585; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2586; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 2587; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2588; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2589; 2590; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2591; GFX940-TGSPLIT: ; %bb.0: ; %entry 2592; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2593; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2594; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2595; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2596; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2597; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2598; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2599; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2600; GFX940-TGSPLIT-NEXT: buffer_inv sc0 2601; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2602; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2603; GFX940-TGSPLIT-NEXT: s_endpgm 2604; 2605; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2606; GFX11-WGP: ; %bb.0: ; %entry 2607; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2608; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2609; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2610; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2611; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2612; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 2613; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2614; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2615; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2616; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2617; GFX11-WGP-NEXT: buffer_gl0_inv 2618; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2619; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2620; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 2621; GFX11-WGP-NEXT: s_endpgm 2622; 2623; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2624; GFX11-CU: ; %bb.0: ; %entry 2625; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2626; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2627; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2628; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2629; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2630; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 2631; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2632; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2633; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2634; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2635; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2636; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 2637; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 2638; GFX11-CU-NEXT: s_endpgm 2639; 2640; GFX12-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2641; GFX12-WGP: ; %bb.0: ; %entry 2642; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2643; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2644; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2645; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2646; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2647; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 2648; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 2649; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 2650; GFX12-WGP-NEXT: s_wait_storecnt 0x0 2651; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 2652; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE 2653; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 2654; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 2655; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 2656; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 2657; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2658; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2659; GFX12-WGP-NEXT: 
flat_store_b32 v[0:1], v2 2660; GFX12-WGP-NEXT: s_endpgm 2661; 2662; GFX12-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: 2663; GFX12-CU: ; %bb.0: ; %entry 2664; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2665; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2666; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2667; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2668; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2669; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 2670; GFX12-CU-NEXT: s_wait_dscnt 0x0 2671; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 2672; GFX12-CU-NEXT: s_wait_dscnt 0x0 2673; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2674; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2675; GFX12-CU-NEXT: s_wait_loadcnt 0x0 2676; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 2677; GFX12-CU-NEXT: s_endpgm 2678 ptr %out, i32 %in) { 2679entry: 2680 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst 2681 store i32 %val, ptr %out, align 4 2682 ret void 2683} 2684 2685define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( 2686; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2687; GFX7: ; %bb.0: ; %entry 2688; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2689; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2690; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 2691; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 2692; GFX7-NEXT: s_mov_b64 s[10:11], 16 2693; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2694; GFX7-NEXT: s_mov_b32 s4, s8 2695; GFX7-NEXT: s_mov_b32 s5, s9 2696; GFX7-NEXT: s_mov_b32 s9, s10 2697; GFX7-NEXT: s_mov_b32 s8, s11 2698; GFX7-NEXT: s_add_u32 s4, s4, s9 2699; GFX7-NEXT: s_addc_u32 s8, s5, s8 2700; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2701; GFX7-NEXT: s_mov_b32 s5, s8 2702; GFX7-NEXT: v_mov_b32_e32 v2, s7 2703; GFX7-NEXT: v_mov_b32_e32 v0, s6 2704; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2705; GFX7-NEXT: v_mov_b32_e32 v3, v0 2706; GFX7-NEXT: v_mov_b32_e32 v0, s4 2707; GFX7-NEXT: v_mov_b32_e32 v1, s5 2708; GFX7-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 2709; GFX7-NEXT: s_endpgm 2710; 2711; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2712; GFX10-WGP: ; %bb.0: ; %entry 2713; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 2714; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2715; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 2716; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 2717; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 2718; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2719; GFX10-WGP-NEXT: s_mov_b32 s4, s8 2720; GFX10-WGP-NEXT: s_mov_b32 s5, s9 2721; GFX10-WGP-NEXT: s_mov_b32 s9, s10 2722; GFX10-WGP-NEXT: s_mov_b32 s8, s11 2723; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 2724; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 2725; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2726; GFX10-WGP-NEXT: s_mov_b32 s5, s8 2727; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 2728; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 2729; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2730; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 2731; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2732; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2733; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2734; GFX10-WGP-NEXT: s_endpgm 2735; 2736; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2737; GFX10-CU: ; %bb.0: ; %entry 2738; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 2739; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2740; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 2741; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 2742; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 2743; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2744; GFX10-CU-NEXT: s_mov_b32 s4, s8 2745; GFX10-CU-NEXT: s_mov_b32 s5, s9 2746; GFX10-CU-NEXT: s_mov_b32 s9, s10 2747; GFX10-CU-NEXT: s_mov_b32 s8, s11 2748; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 2749; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 2750; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2751; GFX10-CU-NEXT: s_mov_b32 s5, s8 2752; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 2753; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 2754; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2755; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 2756; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2757; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2758; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2759; GFX10-CU-NEXT: s_endpgm 2760; 2761; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2762; SKIP-CACHE-INV: ; %bb.0: ; %entry 2763; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 2764; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 2765; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 2766; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 2767; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 2768; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2769; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2770; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2771; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 2772; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 2773; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 2774; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 2775; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 2776; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 2777; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2778; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2779; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2780; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 2781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2783; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2784; SKIP-CACHE-INV-NEXT: s_endpgm 2785; 2786; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2787; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2788; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2789; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 2790; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 2791; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2792; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s7 2793; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 2794; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2795; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 2796; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2797; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2798; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2799; 2800; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2801; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2802; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2803; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 2804; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 2805; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2806; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 2807; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 2808; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2809; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 2810; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2811; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2812; GFX90A-TGSPLIT-NEXT: s_endpgm 2813; 2814; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2815; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2816; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2817; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 2818; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 2819; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2820; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 2821; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 2822; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2823; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 2824; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2825; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2826; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2827; 2828; GFX940-TGSPLIT-LABEL: 
flat_workgroup_monotonic_monotonic_cmpxchg: 2829; GFX940-TGSPLIT: ; %bb.0: ; %entry 2830; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2831; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 2832; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 2833; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2834; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 2835; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 2836; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2837; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 2838; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2839; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2840; GFX940-TGSPLIT-NEXT: s_endpgm 2841; 2842; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2843; GFX11-WGP: ; %bb.0: ; %entry 2844; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2845; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 2846; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 2847; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2848; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 2849; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 2850; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2851; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 2852; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2853; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2854; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2855; GFX11-WGP-NEXT: s_endpgm 2856; 2857; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2858; GFX11-CU: ; %bb.0: ; %entry 2859; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2860; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 2861; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 2862; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2863; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 2864; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 2865; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2866; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 2867; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2868; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 
2869; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2870; GFX11-CU-NEXT: s_endpgm 2871; 2872; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2873; GFX12-WGP: ; %bb.0: ; %entry 2874; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2875; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 2876; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 2877; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2878; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 2879; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 2880; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2881; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 2882; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2883; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2884; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 2885; GFX12-WGP-NEXT: s_endpgm 2886; 2887; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: 2888; GFX12-CU: ; %bb.0: ; %entry 2889; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2890; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 2891; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 2892; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2893; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 2894; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 2895; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2896; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 2897; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2898; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2899; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2900; GFX12-CU-NEXT: s_endpgm 2901 ptr %out, i32 %in, i32 %old) { 2902entry: 2903 %gep = getelementptr i32, ptr %out, i32 4 2904 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic 2905 ret void 2906} 2907 2908define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( 2909; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 2910; GFX7: ; %bb.0: ; %entry 2911; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2912; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2913; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 2914; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 2915; GFX7-NEXT: s_mov_b64 s[10:11], 16 2916; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2917; GFX7-NEXT: s_mov_b32 s4, s8 2918; GFX7-NEXT: s_mov_b32 s5, s9 2919; GFX7-NEXT: s_mov_b32 s9, s10 2920; GFX7-NEXT: s_mov_b32 s8, s11 2921; GFX7-NEXT: s_add_u32 s4, s4, s9 2922; GFX7-NEXT: s_addc_u32 s8, s5, s8 2923; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2924; GFX7-NEXT: s_mov_b32 s5, s8 2925; GFX7-NEXT: v_mov_b32_e32 v2, s7 2926; GFX7-NEXT: v_mov_b32_e32 v0, s6 2927; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2928; GFX7-NEXT: v_mov_b32_e32 v3, v0 2929; GFX7-NEXT: v_mov_b32_e32 v0, s4 2930; GFX7-NEXT: v_mov_b32_e32 v1, s5 2931; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2932; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2933; GFX7-NEXT: s_endpgm 2934; 2935; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 2936; GFX10-WGP: ; %bb.0: ; %entry 2937; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 2938; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2939; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 2940; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 2941; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 2942; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2943; GFX10-WGP-NEXT: s_mov_b32 s4, s8 2944; GFX10-WGP-NEXT: s_mov_b32 s5, s9 2945; GFX10-WGP-NEXT: s_mov_b32 s9, s10 2946; GFX10-WGP-NEXT: s_mov_b32 s8, s11 2947; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 2948; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 2949; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2950; GFX10-WGP-NEXT: s_mov_b32 s5, s8 2951; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 2952; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 2953; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2954; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 2955; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2956; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2957; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2958; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 2959; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2960; GFX10-WGP-NEXT: buffer_gl0_inv 2961; GFX10-WGP-NEXT: s_endpgm 2962; 2963; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 2964; GFX10-CU: ; %bb.0: ; %entry 2965; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 2966; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2967; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 2968; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 2969; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 2970; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2971; GFX10-CU-NEXT: s_mov_b32 s4, s8 2972; GFX10-CU-NEXT: s_mov_b32 s5, s9 2973; GFX10-CU-NEXT: s_mov_b32 s9, s10 2974; GFX10-CU-NEXT: s_mov_b32 s8, s11 2975; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 2976; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 2977; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2978; GFX10-CU-NEXT: s_mov_b32 s5, s8 2979; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 2980; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 2981; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2982; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 2983; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2984; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2985; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2986; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2987; GFX10-CU-NEXT: s_endpgm 2988; 2989; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 2990; SKIP-CACHE-INV: ; %bb.0: ; %entry 2991; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 2992; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 2993; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 2994; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 2995; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 2996; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2997; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2998; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2999; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3000; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3001; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3002; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 
3003; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3004; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3005; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3006; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3007; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3008; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3009; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3010; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3011; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3012; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3013; SKIP-CACHE-INV-NEXT: s_endpgm 3014; 3015; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3016; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3017; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3018; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3019; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3020; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3021; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3022; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3023; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3024; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3025; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3026; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3027; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3028; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3029; 3030; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3031; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3032; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3033; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3034; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3035; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3036; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3037; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3038; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3039; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 3040; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3041; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3042; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3043; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3044; GFX90A-TGSPLIT-NEXT: s_endpgm 3045; 3046; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3047; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3048; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3049; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3050; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3051; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3052; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3053; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3054; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3055; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3056; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3057; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3058; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3059; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3060; 3061; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3062; GFX940-TGSPLIT: ; %bb.0: ; %entry 3063; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3064; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3065; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3066; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3067; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3068; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3069; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3070; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3071; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3072; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3073; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3074; GFX940-TGSPLIT-NEXT: buffer_inv sc0 3075; GFX940-TGSPLIT-NEXT: s_endpgm 3076; 3077; GFX11-WGP-LABEL: 
flat_workgroup_acquire_monotonic_cmpxchg: 3078; GFX11-WGP: ; %bb.0: ; %entry 3079; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3080; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3081; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3082; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3083; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3084; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3085; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3086; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3087; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3088; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3089; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3090; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3091; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3092; GFX11-WGP-NEXT: buffer_gl0_inv 3093; GFX11-WGP-NEXT: s_endpgm 3094; 3095; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3096; GFX11-CU: ; %bb.0: ; %entry 3097; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3098; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3099; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3100; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3101; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3102; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3103; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3104; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3105; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3106; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3107; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3108; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3109; GFX11-CU-NEXT: s_endpgm 3110; 3111; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3112; GFX12-WGP: ; %bb.0: ; %entry 3113; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3114; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3115; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3116; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3117; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3118; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3119; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
3120; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3121; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3122; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3123; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 3124; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 3125; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 3126; GFX12-WGP-NEXT: s_endpgm 3127; 3128; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: 3129; GFX12-CU: ; %bb.0: ; %entry 3130; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3131; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3132; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3133; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3134; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3135; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3136; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3137; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3138; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3139; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3140; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3141; GFX12-CU-NEXT: s_wait_dscnt 0x0 3142; GFX12-CU-NEXT: s_endpgm 3143 ptr %out, i32 %in, i32 %old) { 3144entry: 3145 %gep = getelementptr i32, ptr %out, i32 4 3146 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic 3147 ret void 3148} 3149
; Test case - workgroup-scope `cmpxchg volatile` on a flat pointer with release success ordering and monotonic failure ordering.
; The target address is %out advanced by four i32 elements (the `getelementptr i32, ptr %out, i32 4` below); the cmpxchg result %val is unused.
; The per-target check lines inside this function are autogenerated (see the NOTE on the first line of the file) - regenerate them with utils/update_llc_test_checks.py rather than editing them by hand.
3150define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( 3151; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3152; GFX7: ; %bb.0: ; %entry 3153; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3154; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3155; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3156; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3157; GFX7-NEXT: s_mov_b64 s[10:11], 16 3158; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3159; GFX7-NEXT: s_mov_b32 s4, s8 3160; GFX7-NEXT: s_mov_b32 s5, s9 3161; GFX7-NEXT: s_mov_b32 s9, s10 3162; GFX7-NEXT: s_mov_b32 s8, s11 3163; GFX7-NEXT: s_add_u32 s4, s4, s9 3164; GFX7-NEXT: s_addc_u32 s8, s5, s8 3165; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3166; 
GFX7-NEXT: s_mov_b32 s5, s8 3167; GFX7-NEXT: v_mov_b32_e32 v2, s7 3168; GFX7-NEXT: v_mov_b32_e32 v0, s6 3169; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3170; GFX7-NEXT: v_mov_b32_e32 v3, v0 3171; GFX7-NEXT: v_mov_b32_e32 v0, s4 3172; GFX7-NEXT: v_mov_b32_e32 v1, s5 3173; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3174; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3175; GFX7-NEXT: s_endpgm 3176; 3177; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3178; GFX10-WGP: ; %bb.0: ; %entry 3179; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3180; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3181; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3182; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3183; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3184; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3185; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3186; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3187; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3188; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3189; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3190; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3191; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3192; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3193; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 3194; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3195; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3196; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3197; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3198; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3199; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3200; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3201; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3202; GFX10-WGP-NEXT: s_endpgm 3203; 3204; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3205; GFX10-CU: ; %bb.0: ; %entry 3206; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3207; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3208; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3209; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 3210; GFX10-CU-NEXT: s_mov_b64 
s[10:11], 16 3211; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3212; GFX10-CU-NEXT: s_mov_b32 s4, s8 3213; GFX10-CU-NEXT: s_mov_b32 s5, s9 3214; GFX10-CU-NEXT: s_mov_b32 s9, s10 3215; GFX10-CU-NEXT: s_mov_b32 s8, s11 3216; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3217; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3218; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3219; GFX10-CU-NEXT: s_mov_b32 s5, s8 3220; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3221; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3222; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3223; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3224; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3225; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3226; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3227; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3228; GFX10-CU-NEXT: s_endpgm 3229; 3230; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3231; SKIP-CACHE-INV: ; %bb.0: ; %entry 3232; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3233; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3234; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3235; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3236; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3237; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3238; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3239; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3240; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3241; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3242; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3243; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 3244; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3245; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3246; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3247; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3248; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3249; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3251; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3252; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3253; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3254; SKIP-CACHE-INV-NEXT: s_endpgm 3255; 3256; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3257; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3258; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3259; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3260; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3261; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3262; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3263; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3264; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3265; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3266; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3267; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3268; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3269; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3270; 3271; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3272; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3273; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3274; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3275; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3276; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3277; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3278; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3279; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3280; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3281; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3282; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3283; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3284; GFX90A-TGSPLIT-NEXT: s_endpgm 3285; 3286; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3287; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3288; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3289; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3290; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3291; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3292; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3293; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3294; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3295; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3296; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3297; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3298; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3299; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3300; 3301; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3302; GFX940-TGSPLIT: ; %bb.0: ; %entry 3303; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3304; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3305; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3306; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3307; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3308; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3309; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3310; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3311; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3312; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3313; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3314; GFX940-TGSPLIT-NEXT: s_endpgm 3315; 3316; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3317; GFX11-WGP: ; %bb.0: ; %entry 3318; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3319; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3320; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3321; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3322; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3323; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3324; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3325; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3326; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3327; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3328; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3329; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3330; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3331; GFX11-WGP-NEXT: s_endpgm 3332; 3333; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3334; GFX11-CU: ; %bb.0: ; %entry 3335; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3336; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3337; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3338; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3339; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3340; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3341; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3342; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3343; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3344; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3345; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3346; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3347; GFX11-CU-NEXT: s_endpgm 3348; 3349; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3350; GFX12-WGP: ; %bb.0: ; %entry 3351; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3352; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3353; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3354; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3355; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3356; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3357; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3358; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3359; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3360; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3361; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 3362; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 3363; GFX12-WGP-NEXT: s_wait_storecnt 0x0 3364; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 3365; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 3366; GFX12-WGP-NEXT: s_endpgm 3367; 3368; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: 3369; GFX12-CU: ; %bb.0: ; %entry 3370; GFX12-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 3371; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3372; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3373; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3374; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3375; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3376; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3377; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3378; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3379; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3380; GFX12-CU-NEXT: s_wait_dscnt 0x0 3381; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3382; GFX12-CU-NEXT: s_endpgm 3383 ptr %out, i32 %in, i32 %old) { 3384entry: 3385 %gep = getelementptr i32, ptr %out, i32 4 3386 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic 3387 ret void 3388} 3389
; Test case - workgroup-scope `cmpxchg volatile` on a flat pointer with acq_rel success ordering and monotonic failure ordering.
; The target address is %out advanced by four i32 elements (the `getelementptr i32, ptr %out, i32 4` below); the cmpxchg result %val is unused.
; The per-target check lines inside this function are autogenerated (see the NOTE on the first line of the file) - regenerate them with utils/update_llc_test_checks.py rather than editing them by hand.
3390define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( 3391; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3392; GFX7: ; %bb.0: ; %entry 3393; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3394; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3395; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3396; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3397; GFX7-NEXT: s_mov_b64 s[10:11], 16 3398; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3399; GFX7-NEXT: s_mov_b32 s4, s8 3400; GFX7-NEXT: s_mov_b32 s5, s9 3401; GFX7-NEXT: s_mov_b32 s9, s10 3402; GFX7-NEXT: s_mov_b32 s8, s11 3403; GFX7-NEXT: s_add_u32 s4, s4, s9 3404; GFX7-NEXT: s_addc_u32 s8, s5, s8 3405; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3406; GFX7-NEXT: s_mov_b32 s5, s8 3407; GFX7-NEXT: v_mov_b32_e32 v2, s7 3408; GFX7-NEXT: v_mov_b32_e32 v0, s6 3409; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3410; GFX7-NEXT: v_mov_b32_e32 v3, v0 3411; GFX7-NEXT: v_mov_b32_e32 v0, s4 3412; GFX7-NEXT: v_mov_b32_e32 v1, s5 3413; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3414; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3415; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3416; GFX7-NEXT: s_endpgm 3417; 3418; GFX10-WGP-LABEL: 
flat_workgroup_acq_rel_monotonic_cmpxchg: 3419; GFX10-WGP: ; %bb.0: ; %entry 3420; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3421; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3422; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3423; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3424; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3425; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3426; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3427; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3428; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3429; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3430; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3431; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3432; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3433; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3434; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 3435; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3436; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3437; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3438; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3439; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3440; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3441; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3442; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3443; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3444; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3445; GFX10-WGP-NEXT: buffer_gl0_inv 3446; GFX10-WGP-NEXT: s_endpgm 3447; 3448; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3449; GFX10-CU: ; %bb.0: ; %entry 3450; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3451; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3452; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3453; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 3454; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 3455; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3456; GFX10-CU-NEXT: s_mov_b32 s4, s8 3457; GFX10-CU-NEXT: s_mov_b32 s5, s9 3458; GFX10-CU-NEXT: s_mov_b32 s9, s10 3459; GFX10-CU-NEXT: s_mov_b32 s8, s11 3460; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3461; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3462; GFX10-CU-NEXT: ; 
kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3463; GFX10-CU-NEXT: s_mov_b32 s5, s8 3464; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3465; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3466; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3467; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3468; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3469; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3470; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3471; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3472; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3473; GFX10-CU-NEXT: s_endpgm 3474; 3475; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3476; SKIP-CACHE-INV: ; %bb.0: ; %entry 3477; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3478; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3479; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3480; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3481; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3482; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3483; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3484; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3485; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3486; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3487; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3488; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 3489; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3490; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3491; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3492; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3493; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3494; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3495; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3496; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3497; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3498; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3499; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3500; SKIP-CACHE-INV-NEXT: s_endpgm 3501; 3502; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3503; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3504; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3505; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3506; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3507; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3508; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3509; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3510; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3511; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3512; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3513; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3514; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3515; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3516; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3517; 3518; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3519; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3520; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3521; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3522; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3523; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3524; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3525; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3526; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3527; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3528; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3529; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3530; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3531; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3532; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3533; GFX90A-TGSPLIT-NEXT: s_endpgm 3534; 3535; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3536; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3537; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3538; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3539; GFX940-NOTTGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0xc 3540; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3541; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3542; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3543; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3544; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3545; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3546; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3547; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3548; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3549; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3550; 3551; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3552; GFX940-TGSPLIT: ; %bb.0: ; %entry 3553; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3554; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3555; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3556; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3557; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3558; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3559; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3560; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3561; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3562; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3563; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3564; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3565; GFX940-TGSPLIT-NEXT: buffer_inv sc0 3566; GFX940-TGSPLIT-NEXT: s_endpgm 3567; 3568; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3569; GFX11-WGP: ; %bb.0: ; %entry 3570; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3571; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3572; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3573; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3574; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3575; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3576; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3577; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 
3578; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3579; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3580; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3581; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3582; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3583; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3584; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3585; GFX11-WGP-NEXT: buffer_gl0_inv 3586; GFX11-WGP-NEXT: s_endpgm 3587; 3588; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3589; GFX11-CU: ; %bb.0: ; %entry 3590; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3591; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3592; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3593; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3594; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3595; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3596; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3597; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3598; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3599; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3600; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3601; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3602; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3603; GFX11-CU-NEXT: s_endpgm 3604; 3605; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3606; GFX12-WGP: ; %bb.0: ; %entry 3607; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3608; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3609; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3610; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3611; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3612; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3613; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3614; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3615; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3616; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3617; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 3618; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 3619; GFX12-WGP-NEXT: s_wait_storecnt 0x0 3620; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 3621; GFX12-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 3622; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 3623; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 3624; GFX12-WGP-NEXT: s_endpgm 3625; 3626; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: 3627; GFX12-CU: ; %bb.0: ; %entry 3628; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3629; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3630; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3631; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3632; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3633; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3634; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3635; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3636; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3637; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3638; GFX12-CU-NEXT: s_wait_dscnt 0x0 3639; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3640; GFX12-CU-NEXT: s_wait_dscnt 0x0 3641; GFX12-CU-NEXT: s_endpgm 3642 ptr %out, i32 %in, i32 %old) { 3643entry: 3644 %gep = getelementptr i32, ptr %out, i32 4 3645 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic 3646 ret void 3647} 3648 3649define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( 3650; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3651; GFX7: ; %bb.0: ; %entry 3652; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3653; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3654; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3655; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3656; GFX7-NEXT: s_mov_b64 s[10:11], 16 3657; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3658; GFX7-NEXT: s_mov_b32 s4, s8 3659; GFX7-NEXT: s_mov_b32 s5, s9 3660; GFX7-NEXT: s_mov_b32 s9, s10 3661; GFX7-NEXT: s_mov_b32 s8, s11 3662; GFX7-NEXT: s_add_u32 s4, s4, s9 3663; GFX7-NEXT: s_addc_u32 s8, s5, s8 3664; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3665; GFX7-NEXT: s_mov_b32 s5, s8 3666; GFX7-NEXT: v_mov_b32_e32 v2, s7 3667; GFX7-NEXT: v_mov_b32_e32 v0, s6 3668; 
GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3669; GFX7-NEXT: v_mov_b32_e32 v3, v0 3670; GFX7-NEXT: v_mov_b32_e32 v0, s4 3671; GFX7-NEXT: v_mov_b32_e32 v1, s5 3672; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3673; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3674; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3675; GFX7-NEXT: s_endpgm 3676; 3677; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3678; GFX10-WGP: ; %bb.0: ; %entry 3679; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3680; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3681; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3682; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3683; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3684; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3685; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3686; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3687; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3688; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3689; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3690; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3691; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3692; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3693; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 3694; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3695; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3696; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3697; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3698; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3699; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3700; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3701; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3702; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3703; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3704; GFX10-WGP-NEXT: buffer_gl0_inv 3705; GFX10-WGP-NEXT: s_endpgm 3706; 3707; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3708; GFX10-CU: ; %bb.0: ; %entry 3709; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3710; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3711; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3712; GFX10-CU-NEXT: 
s_load_dword s6, s[4:5], 0xc 3713; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 3714; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3715; GFX10-CU-NEXT: s_mov_b32 s4, s8 3716; GFX10-CU-NEXT: s_mov_b32 s5, s9 3717; GFX10-CU-NEXT: s_mov_b32 s9, s10 3718; GFX10-CU-NEXT: s_mov_b32 s8, s11 3719; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3720; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3721; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3722; GFX10-CU-NEXT: s_mov_b32 s5, s8 3723; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3724; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3725; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3726; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3727; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3728; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3729; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3730; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3731; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3732; GFX10-CU-NEXT: s_endpgm 3733; 3734; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3735; SKIP-CACHE-INV: ; %bb.0: ; %entry 3736; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3737; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3738; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3739; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3740; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3741; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3742; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3743; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3744; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3745; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3746; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3747; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 3748; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3749; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3750; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3751; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3752; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3754; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3755; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3756; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3757; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3758; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3759; SKIP-CACHE-INV-NEXT: s_endpgm 3760; 3761; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3762; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3763; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3764; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3765; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3766; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3767; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3768; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3769; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3770; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3771; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3772; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3773; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3774; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3775; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3776; 3777; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3778; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3779; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3780; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3781; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3782; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3783; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3784; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3785; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3786; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3787; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3788; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3789; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3790; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) 3791; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 3792; GFX90A-TGSPLIT-NEXT: s_endpgm 3793; 3794; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3795; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3796; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3797; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3798; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3799; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3800; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3801; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3802; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3803; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3804; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3805; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3806; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3807; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3808; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3809; 3810; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3811; GFX940-TGSPLIT: ; %bb.0: ; %entry 3812; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3813; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3814; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3815; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3816; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3817; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3818; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3819; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3820; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3821; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3822; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3823; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 3824; GFX940-TGSPLIT-NEXT: buffer_inv sc0 3825; GFX940-TGSPLIT-NEXT: s_endpgm 3826; 3827; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3828; GFX11-WGP: ; %bb.0: ; %entry 3829; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 3830; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3831; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3832; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3833; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3834; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3835; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3836; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3837; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3838; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3839; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3840; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3841; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3842; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3843; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3844; GFX11-WGP-NEXT: buffer_gl0_inv 3845; GFX11-WGP-NEXT: s_endpgm 3846; 3847; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3848; GFX11-CU: ; %bb.0: ; %entry 3849; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3850; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3851; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3852; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3853; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3854; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3855; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3856; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3857; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3858; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3859; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3860; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3861; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3862; GFX11-CU-NEXT: s_endpgm 3863; 3864; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3865; GFX12-WGP: ; %bb.0: ; %entry 3866; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3867; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3868; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3869; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3870; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3871; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3872; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 3873; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3874; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3875; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3876; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 3877; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 3878; GFX12-WGP-NEXT: s_wait_storecnt 0x0 3879; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 3880; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 3881; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 3882; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 3883; GFX12-WGP-NEXT: s_endpgm 3884; 3885; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: 3886; GFX12-CU: ; %bb.0: ; %entry 3887; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3888; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3889; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3890; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3891; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3892; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3893; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3894; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3895; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3896; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3897; GFX12-CU-NEXT: s_wait_dscnt 0x0 3898; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3899; GFX12-CU-NEXT: s_wait_dscnt 0x0 3900; GFX12-CU-NEXT: s_endpgm 3901 ptr %out, i32 %in, i32 %old) { 3902entry: 3903 %gep = getelementptr i32, ptr %out, i32 4 3904 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic 3905 ret void 3906} 3907 3908define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( 3909; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 3910; GFX7: ; %bb.0: ; %entry 3911; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3912; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3913; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3914; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3915; GFX7-NEXT: s_mov_b64 s[10:11], 16 3916; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3917; GFX7-NEXT: s_mov_b32 s4, s8 3918; GFX7-NEXT: 
s_mov_b32 s5, s9 3919; GFX7-NEXT: s_mov_b32 s9, s10 3920; GFX7-NEXT: s_mov_b32 s8, s11 3921; GFX7-NEXT: s_add_u32 s4, s4, s9 3922; GFX7-NEXT: s_addc_u32 s8, s5, s8 3923; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3924; GFX7-NEXT: s_mov_b32 s5, s8 3925; GFX7-NEXT: v_mov_b32_e32 v2, s7 3926; GFX7-NEXT: v_mov_b32_e32 v0, s6 3927; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3928; GFX7-NEXT: v_mov_b32_e32 v3, v0 3929; GFX7-NEXT: v_mov_b32_e32 v0, s4 3930; GFX7-NEXT: v_mov_b32_e32 v1, s5 3931; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3932; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3933; GFX7-NEXT: s_endpgm 3934; 3935; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 3936; GFX10-WGP: ; %bb.0: ; %entry 3937; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3938; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3939; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3940; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3941; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3942; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3943; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3944; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3945; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3946; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3947; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3948; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3949; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3950; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3951; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 3952; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3953; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3954; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3955; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3956; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3957; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3958; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3959; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3960; GFX10-WGP-NEXT: buffer_gl0_inv 3961; GFX10-WGP-NEXT: s_endpgm 3962; 3963; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 3964; 
GFX10-CU: ; %bb.0: ; %entry 3965; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3966; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3967; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3968; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 3969; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 3970; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3971; GFX10-CU-NEXT: s_mov_b32 s4, s8 3972; GFX10-CU-NEXT: s_mov_b32 s5, s9 3973; GFX10-CU-NEXT: s_mov_b32 s9, s10 3974; GFX10-CU-NEXT: s_mov_b32 s8, s11 3975; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3976; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3977; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3978; GFX10-CU-NEXT: s_mov_b32 s5, s8 3979; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3980; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3981; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3982; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3983; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3984; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3985; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3986; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3987; GFX10-CU-NEXT: s_endpgm 3988; 3989; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 3990; SKIP-CACHE-INV: ; %bb.0: ; %entry 3991; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3992; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3993; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3994; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3995; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3996; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3997; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3998; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3999; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4000; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4001; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 4002; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4003; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4004; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4005; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4006; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 4007; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4008; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4009; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4010; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4011; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4012; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4013; SKIP-CACHE-INV-NEXT: s_endpgm 4014; 4015; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4016; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4017; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4018; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4019; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4020; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4021; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4022; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4023; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4024; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4025; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4026; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4027; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4028; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4029; 4030; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4031; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4032; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4033; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4034; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4035; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4036; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4037; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4038; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4039; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4040; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4041; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4042; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) 4043; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4044; GFX90A-TGSPLIT-NEXT: s_endpgm 4045; 4046; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4047; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4048; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4049; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4050; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4051; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4052; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4053; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4054; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4055; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4056; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4057; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4058; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4059; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4060; 4061; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4062; GFX940-TGSPLIT: ; %bb.0: ; %entry 4063; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4064; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4065; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4066; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4067; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4068; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4069; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4070; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4071; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4072; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4073; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4074; GFX940-TGSPLIT-NEXT: buffer_inv sc0 4075; GFX940-TGSPLIT-NEXT: s_endpgm 4076; 4077; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4078; GFX11-WGP: ; %bb.0: ; %entry 4079; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4080; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4081; GFX11-WGP-NEXT: s_load_b32 s2, 
s[4:5], 0xc 4082; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4083; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 4084; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 4085; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4086; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 4087; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 4088; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 4089; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4090; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4091; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4092; GFX11-WGP-NEXT: buffer_gl0_inv 4093; GFX11-WGP-NEXT: s_endpgm 4094; 4095; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4096; GFX11-CU: ; %bb.0: ; %entry 4097; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4098; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4099; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4100; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4101; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 4102; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 4103; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4104; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 4105; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 4106; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 4107; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4108; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4109; GFX11-CU-NEXT: s_endpgm 4110; 4111; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4112; GFX12-WGP: ; %bb.0: ; %entry 4113; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4114; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4115; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4116; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 4117; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 4118; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 4119; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4120; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 4121; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 4122; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 4123; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 
4124; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 4125; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 4126; GFX12-WGP-NEXT: s_endpgm 4127; 4128; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: 4129; GFX12-CU: ; %bb.0: ; %entry 4130; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4131; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4132; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4133; GFX12-CU-NEXT: s_wait_kmcnt 0x0 4134; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 4135; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 4136; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4137; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 4138; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 4139; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 4140; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4141; GFX12-CU-NEXT: s_wait_dscnt 0x0 4142; GFX12-CU-NEXT: s_endpgm 4143 ptr %out, i32 %in, i32 %old) { 4144entry: 4145 %gep = getelementptr i32, ptr %out, i32 4 4146 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire 4147 ret void 4148} 4149 4150define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( 4151; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4152; GFX7: ; %bb.0: ; %entry 4153; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4154; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4155; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 4156; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 4157; GFX7-NEXT: s_mov_b64 s[10:11], 16 4158; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4159; GFX7-NEXT: s_mov_b32 s4, s8 4160; GFX7-NEXT: s_mov_b32 s5, s9 4161; GFX7-NEXT: s_mov_b32 s9, s10 4162; GFX7-NEXT: s_mov_b32 s8, s11 4163; GFX7-NEXT: s_add_u32 s4, s4, s9 4164; GFX7-NEXT: s_addc_u32 s8, s5, s8 4165; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4166; GFX7-NEXT: s_mov_b32 s5, s8 4167; GFX7-NEXT: v_mov_b32_e32 v2, s7 4168; GFX7-NEXT: v_mov_b32_e32 v0, s6 4169; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4170; GFX7-NEXT: v_mov_b32_e32 v3, v0 
4171; GFX7-NEXT: v_mov_b32_e32 v0, s4 4172; GFX7-NEXT: v_mov_b32_e32 v1, s5 4173; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4174; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4175; GFX7-NEXT: s_endpgm 4176; 4177; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4178; GFX10-WGP: ; %bb.0: ; %entry 4179; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 4180; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4181; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 4182; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 4183; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 4184; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4185; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4186; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4187; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4188; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4189; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4190; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4191; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4192; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4193; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4194; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4195; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4196; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4197; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4198; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4199; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4200; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4201; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4202; GFX10-WGP-NEXT: buffer_gl0_inv 4203; GFX10-WGP-NEXT: s_endpgm 4204; 4205; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4206; GFX10-CU: ; %bb.0: ; %entry 4207; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4208; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4209; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4210; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4211; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4212; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4213; GFX10-CU-NEXT: s_mov_b32 s4, s8 4214; GFX10-CU-NEXT: s_mov_b32 s5, s9 4215; GFX10-CU-NEXT: s_mov_b32 s9, s10 4216; GFX10-CU-NEXT: s_mov_b32 
s8, s11 4217; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4218; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4219; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4220; GFX10-CU-NEXT: s_mov_b32 s5, s8 4221; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4222; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4223; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4224; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4225; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4226; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4227; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4228; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4229; GFX10-CU-NEXT: s_endpgm 4230; 4231; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4232; SKIP-CACHE-INV: ; %bb.0: ; %entry 4233; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4234; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4235; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 4236; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 4237; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 4238; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4239; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 4240; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 4241; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4242; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4243; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 4244; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4245; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4246; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4247; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4248; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4249; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4251; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4252; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4253; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4254; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4255; SKIP-CACHE-INV-NEXT: s_endpgm 4256; 4257; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_acquire_acquire_cmpxchg: 4258; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4259; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4260; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4261; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4262; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4263; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4264; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4265; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4266; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4267; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4268; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4269; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4270; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4271; 4272; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4273; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4274; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4275; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4276; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4277; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4278; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4279; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4280; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4281; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4282; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4283; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4284; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4285; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4286; GFX90A-TGSPLIT-NEXT: s_endpgm 4287; 4288; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4289; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4290; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4291; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4292; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4293; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 4294; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4295; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4296; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4297; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4298; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4299; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4300; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4301; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4302; 4303; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4304; GFX940-TGSPLIT: ; %bb.0: ; %entry 4305; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4306; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4307; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4308; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4309; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4310; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4311; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4312; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4313; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4314; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4315; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4316; GFX940-TGSPLIT-NEXT: buffer_inv sc0 4317; GFX940-TGSPLIT-NEXT: s_endpgm 4318; 4319; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4320; GFX11-WGP: ; %bb.0: ; %entry 4321; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4322; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4323; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4324; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4325; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 4326; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 4327; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4328; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 4329; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 4330; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 4331; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4332; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 4333; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4334; GFX11-WGP-NEXT: buffer_gl0_inv 4335; GFX11-WGP-NEXT: s_endpgm 4336; 4337; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4338; GFX11-CU: ; %bb.0: ; %entry 4339; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4340; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4341; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4342; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4343; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 4344; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 4345; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4346; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 4347; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 4348; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 4349; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4350; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4351; GFX11-CU-NEXT: s_endpgm 4352; 4353; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4354; GFX12-WGP: ; %bb.0: ; %entry 4355; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4356; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4357; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4358; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 4359; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 4360; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 4361; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4362; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 4363; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 4364; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 4365; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 4366; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 4367; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 4368; GFX12-WGP-NEXT: s_endpgm 4369; 4370; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: 4371; GFX12-CU: ; %bb.0: ; %entry 4372; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4373; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4374; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4375; GFX12-CU-NEXT: s_wait_kmcnt 0x0 4376; GFX12-CU-NEXT: v_mov_b32_e32 v2, 
s3 4377; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 4378; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4379; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 4380; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 4381; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 4382; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4383; GFX12-CU-NEXT: s_wait_dscnt 0x0 4384; GFX12-CU-NEXT: s_endpgm 4385 ptr %out, i32 %in, i32 %old) { 4386entry: 4387 %gep = getelementptr i32, ptr %out, i32 4 4388 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire 4389 ret void 4390} 4391 4392define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( 4393; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: 4394; GFX7: ; %bb.0: ; %entry 4395; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4396; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4397; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 4398; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 4399; GFX7-NEXT: s_mov_b64 s[10:11], 16 4400; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4401; GFX7-NEXT: s_mov_b32 s4, s8 4402; GFX7-NEXT: s_mov_b32 s5, s9 4403; GFX7-NEXT: s_mov_b32 s9, s10 4404; GFX7-NEXT: s_mov_b32 s8, s11 4405; GFX7-NEXT: s_add_u32 s4, s4, s9 4406; GFX7-NEXT: s_addc_u32 s8, s5, s8 4407; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4408; GFX7-NEXT: s_mov_b32 s5, s8 4409; GFX7-NEXT: v_mov_b32_e32 v2, s7 4410; GFX7-NEXT: v_mov_b32_e32 v0, s6 4411; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4412; GFX7-NEXT: v_mov_b32_e32 v3, v0 4413; GFX7-NEXT: v_mov_b32_e32 v0, s4 4414; GFX7-NEXT: v_mov_b32_e32 v1, s5 4415; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4416; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4417; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4418; GFX7-NEXT: s_endpgm 4419; 4420; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: 4421; GFX10-WGP: ; %bb.0: ; %entry 4422; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 4423; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4424; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 4425; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 4426; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 4427; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4428; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4429; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4430; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4431; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4432; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4433; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4434; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4435; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4436; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4437; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4438; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4439; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4440; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4441; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4442; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4443; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4444; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4445; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4446; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4447; GFX10-WGP-NEXT: buffer_gl0_inv 4448; GFX10-WGP-NEXT: s_endpgm 4449; 4450; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: 4451; GFX10-CU: ; %bb.0: ; %entry 4452; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4453; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4454; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4455; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4456; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4457; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4458; GFX10-CU-NEXT: s_mov_b32 s4, s8 4459; GFX10-CU-NEXT: s_mov_b32 s5, s9 4460; GFX10-CU-NEXT: s_mov_b32 s9, s10 4461; GFX10-CU-NEXT: s_mov_b32 s8, s11 4462; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4463; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4464; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4465; GFX10-CU-NEXT: s_mov_b32 s5, s8 4466; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4467; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4468; GFX10-CU-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4469; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4470; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4471; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4472; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4473; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4474; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4475; GFX10-CU-NEXT: s_endpgm 4476; 4477; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg: 4478; SKIP-CACHE-INV: ; %bb.0: ; %entry 4479; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4480; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4481; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 4482; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 4483; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 4484; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4485; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 4486; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 4487; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4488; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4489; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 4490; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4491; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4492; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4493; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4494; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4495; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4496; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4497; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4498; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4499; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4500; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4501; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4502; SKIP-CACHE-INV-NEXT: s_endpgm 4503; 4504; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: 4505; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4506; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4507; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4508; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s6, s[8:9], 0xc 4509; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4510; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4511; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4512; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4513; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4514; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4515; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4516; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4517; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4518; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4519; 4520; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: 4521; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4522; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4523; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4524; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4525; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4526; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4527; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4528; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4529; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4530; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4531; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4532; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4533; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4534; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4535; GFX90A-TGSPLIT-NEXT: s_endpgm 4536; 4537; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: 4538; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4539; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4540; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4541; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4542; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4543; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4544; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4545; GFX940-NOTTGSPLIT-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4546; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4547; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4548; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4549; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4550; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4551; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4552; 4553; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: 4554; GFX940-TGSPLIT: ; %bb.0: ; %entry 4555; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4556; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4557; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4558; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4559; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4560; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4561; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4562; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4563; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4564; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4565; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4566; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4567; GFX940-TGSPLIT-NEXT: buffer_inv sc0 4568; GFX940-TGSPLIT-NEXT: s_endpgm 4569; 4570; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: 4571; GFX11-WGP: ; %bb.0: ; %entry 4572; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4573; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4574; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4575; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4576; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 4577; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 4578; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4579; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 4580; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 4581; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 4582; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4583; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4584; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4585; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4586; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4587; GFX11-WGP-NEXT: buffer_gl0_inv 4588; GFX11-WGP-NEXT: s_endpgm 4589; 4590; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: 4591; GFX11-CU: ; %bb.0: ; %entry 4592; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4593; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4594; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4595; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4596; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 4597; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 4598; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4599; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 4600; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 4601; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 4602; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4603; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4604; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4605; GFX11-CU-NEXT: s_endpgm 4606; 4607; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: 4608; GFX12-WGP: ; %bb.0: ; %entry 4609; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4610; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4611; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4612; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 4613; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 4614; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 4615; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4616; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 4617; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 4618; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 4619; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 4620; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 4621; GFX12-WGP-NEXT: s_wait_storecnt 0x0 4622; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 4623; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 4624; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 4625; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 4626; GFX12-WGP-NEXT: s_endpgm 4627; 4628; GFX12-CU-LABEL: 
flat_workgroup_release_acquire_cmpxchg: 4629; GFX12-CU: ; %bb.0: ; %entry 4630; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4631; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4632; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4633; GFX12-CU-NEXT: s_wait_kmcnt 0x0 4634; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 4635; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 4636; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4637; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 4638; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 4639; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 4640; GFX12-CU-NEXT: s_wait_dscnt 0x0 4641; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4642; GFX12-CU-NEXT: s_wait_dscnt 0x0 4643; GFX12-CU-NEXT: s_endpgm 4644 ptr %out, i32 %in, i32 %old) { 4645entry: 4646 %gep = getelementptr i32, ptr %out, i32 4 4647 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire 4648 ret void 4649} 4650 4651define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( 4652; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4653; GFX7: ; %bb.0: ; %entry 4654; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4655; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4656; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 4657; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 4658; GFX7-NEXT: s_mov_b64 s[10:11], 16 4659; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4660; GFX7-NEXT: s_mov_b32 s4, s8 4661; GFX7-NEXT: s_mov_b32 s5, s9 4662; GFX7-NEXT: s_mov_b32 s9, s10 4663; GFX7-NEXT: s_mov_b32 s8, s11 4664; GFX7-NEXT: s_add_u32 s4, s4, s9 4665; GFX7-NEXT: s_addc_u32 s8, s5, s8 4666; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4667; GFX7-NEXT: s_mov_b32 s5, s8 4668; GFX7-NEXT: v_mov_b32_e32 v2, s7 4669; GFX7-NEXT: v_mov_b32_e32 v0, s6 4670; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4671; GFX7-NEXT: v_mov_b32_e32 v3, v0 4672; GFX7-NEXT: v_mov_b32_e32 v0, s4 4673; GFX7-NEXT: v_mov_b32_e32 v1, s5 4674; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4675; 
GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4676; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4677; GFX7-NEXT: s_endpgm 4678; 4679; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4680; GFX10-WGP: ; %bb.0: ; %entry 4681; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 4682; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4683; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 4684; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 4685; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 4686; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4687; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4688; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4689; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4690; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4691; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4692; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4693; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4694; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4695; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4696; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4697; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4698; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4699; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4700; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4701; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4702; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4703; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4704; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4705; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4706; GFX10-WGP-NEXT: buffer_gl0_inv 4707; GFX10-WGP-NEXT: s_endpgm 4708; 4709; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4710; GFX10-CU: ; %bb.0: ; %entry 4711; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4712; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4713; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4714; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4715; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4716; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4717; GFX10-CU-NEXT: s_mov_b32 s4, s8 4718; GFX10-CU-NEXT: s_mov_b32 s5, s9 4719; GFX10-CU-NEXT: s_mov_b32 s9, s10 4720; 
GFX10-CU-NEXT: s_mov_b32 s8, s11 4721; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4722; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4723; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4724; GFX10-CU-NEXT: s_mov_b32 s5, s8 4725; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4726; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4727; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4728; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4729; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4730; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4731; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4732; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4733; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4734; GFX10-CU-NEXT: s_endpgm 4735; 4736; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4737; SKIP-CACHE-INV: ; %bb.0: ; %entry 4738; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4739; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4740; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 4741; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 4742; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 4743; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4744; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 4745; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 4746; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4747; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4748; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 4749; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4750; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4751; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4752; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4754; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4755; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4756; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4757; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4758; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4759; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4760; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 4761; SKIP-CACHE-INV-NEXT: s_endpgm 4762; 4763; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4764; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4765; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4766; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4767; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4768; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4769; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4770; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4771; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4772; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4773; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4774; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4775; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4776; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4777; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4778; 4779; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4780; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4781; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4782; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4783; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4784; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4785; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4786; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4787; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4788; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4789; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4790; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4791; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4792; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4793; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 4794; GFX90A-TGSPLIT-NEXT: s_endpgm 4795; 4796; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4797; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4798; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 4799; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4800; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4801; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4802; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4803; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4804; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4805; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4806; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4807; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4808; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4809; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4810; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4811; 4812; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4813; GFX940-TGSPLIT: ; %bb.0: ; %entry 4814; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4815; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4816; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4817; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4818; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4819; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4820; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4821; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4822; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4823; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4824; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4825; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 4826; GFX940-TGSPLIT-NEXT: buffer_inv sc0 4827; GFX940-TGSPLIT-NEXT: s_endpgm 4828; 4829; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4830; GFX11-WGP: ; %bb.0: ; %entry 4831; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4832; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4833; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4834; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4835; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 4836; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 4837; 
GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4838; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 4839; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 4840; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 4841; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4842; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4843; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4844; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4845; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4846; GFX11-WGP-NEXT: buffer_gl0_inv 4847; GFX11-WGP-NEXT: s_endpgm 4848; 4849; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4850; GFX11-CU: ; %bb.0: ; %entry 4851; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4852; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4853; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4854; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4855; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 4856; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 4857; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4858; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 4859; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 4860; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 4861; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4862; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4863; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4864; GFX11-CU-NEXT: s_endpgm 4865; 4866; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4867; GFX12-WGP: ; %bb.0: ; %entry 4868; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4869; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4870; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4871; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 4872; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 4873; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 4874; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4875; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 4876; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 4877; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 4878; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 4879; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 4880; 
GFX12-WGP-NEXT: s_wait_storecnt 0x0 4881; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 4882; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 4883; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 4884; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 4885; GFX12-WGP-NEXT: s_endpgm 4886; 4887; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: 4888; GFX12-CU: ; %bb.0: ; %entry 4889; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4890; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4891; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4892; GFX12-CU-NEXT: s_wait_kmcnt 0x0 4893; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 4894; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 4895; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4896; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 4897; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 4898; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 4899; GFX12-CU-NEXT: s_wait_dscnt 0x0 4900; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4901; GFX12-CU-NEXT: s_wait_dscnt 0x0 4902; GFX12-CU-NEXT: s_endpgm 4903 ptr %out, i32 %in, i32 %old) { 4904entry: 4905 %gep = getelementptr i32, ptr %out, i32 4 4906 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire 4907 ret void 4908} 4909 4910define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( 4911; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 4912; GFX7: ; %bb.0: ; %entry 4913; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4914; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4915; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 4916; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 4917; GFX7-NEXT: s_mov_b64 s[10:11], 16 4918; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4919; GFX7-NEXT: s_mov_b32 s4, s8 4920; GFX7-NEXT: s_mov_b32 s5, s9 4921; GFX7-NEXT: s_mov_b32 s9, s10 4922; GFX7-NEXT: s_mov_b32 s8, s11 4923; GFX7-NEXT: s_add_u32 s4, s4, s9 4924; GFX7-NEXT: s_addc_u32 s8, s5, s8 4925; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4926; GFX7-NEXT: 
s_mov_b32 s5, s8 4927; GFX7-NEXT: v_mov_b32_e32 v2, s7 4928; GFX7-NEXT: v_mov_b32_e32 v0, s6 4929; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4930; GFX7-NEXT: v_mov_b32_e32 v3, v0 4931; GFX7-NEXT: v_mov_b32_e32 v0, s4 4932; GFX7-NEXT: v_mov_b32_e32 v1, s5 4933; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4934; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4935; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4936; GFX7-NEXT: s_endpgm 4937; 4938; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 4939; GFX10-WGP: ; %bb.0: ; %entry 4940; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 4941; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4942; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 4943; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 4944; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 4945; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4946; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4947; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4948; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4949; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4950; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4951; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4952; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4953; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4954; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4955; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4956; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4957; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4958; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4959; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4960; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4961; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4962; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4963; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4964; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4965; GFX10-WGP-NEXT: buffer_gl0_inv 4966; GFX10-WGP-NEXT: s_endpgm 4967; 4968; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 4969; GFX10-CU: ; %bb.0: ; %entry 4970; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4971; GFX10-CU-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 4972; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4973; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4974; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4975; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4976; GFX10-CU-NEXT: s_mov_b32 s4, s8 4977; GFX10-CU-NEXT: s_mov_b32 s5, s9 4978; GFX10-CU-NEXT: s_mov_b32 s9, s10 4979; GFX10-CU-NEXT: s_mov_b32 s8, s11 4980; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4981; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4982; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4983; GFX10-CU-NEXT: s_mov_b32 s5, s8 4984; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4985; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4986; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4987; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4988; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4989; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4990; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4991; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4992; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4993; GFX10-CU-NEXT: s_endpgm 4994; 4995; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 4996; SKIP-CACHE-INV: ; %bb.0: ; %entry 4997; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4998; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4999; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 5000; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 5001; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 5002; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5003; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 5004; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 5005; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 5006; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 5007; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 5008; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 5009; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 5010; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 5011; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5012; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5013; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 5014; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5015; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5016; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5017; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5018; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5019; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5020; SKIP-CACHE-INV-NEXT: s_endpgm 5021; 5022; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5023; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5024; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5025; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5026; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5027; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5028; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5029; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5030; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5031; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5032; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5033; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5034; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5035; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5036; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5037; 5038; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5039; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5040; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5041; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5042; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5043; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5044; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5045; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5046; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5047; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5048; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5049; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5050; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5051; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5052; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5053; GFX90A-TGSPLIT-NEXT: s_endpgm 5054; 5055; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5056; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5057; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5058; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5059; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5060; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5061; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5062; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5063; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5064; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5065; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5066; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5067; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5068; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5069; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5070; 5071; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5072; GFX940-TGSPLIT: ; %bb.0: ; %entry 5073; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5074; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5075; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5076; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5077; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5078; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5079; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5080; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5081; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5082; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5083; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5084; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5085; GFX940-TGSPLIT-NEXT: buffer_inv sc0 5086; GFX940-TGSPLIT-NEXT: s_endpgm 5087; 5088; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 
5089; GFX11-WGP: ; %bb.0: ; %entry 5090; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5091; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5092; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5093; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5094; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5095; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5096; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5097; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5098; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5099; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5100; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5101; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5102; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5103; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5104; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5105; GFX11-WGP-NEXT: buffer_gl0_inv 5106; GFX11-WGP-NEXT: s_endpgm 5107; 5108; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5109; GFX11-CU: ; %bb.0: ; %entry 5110; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5111; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5112; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5113; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5114; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5115; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5116; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5117; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5118; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5119; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5120; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5121; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5122; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5123; GFX11-CU-NEXT: s_endpgm 5124; 5125; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5126; GFX12-WGP: ; %bb.0: ; %entry 5127; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5128; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5129; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5130; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5131; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5132; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s2 5133; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5134; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5135; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5136; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5137; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 5138; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 5139; GFX12-WGP-NEXT: s_wait_storecnt 0x0 5140; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 5141; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 5142; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 5143; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 5144; GFX12-WGP-NEXT: s_endpgm 5145; 5146; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: 5147; GFX12-CU: ; %bb.0: ; %entry 5148; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5149; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5150; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5151; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5152; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5153; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5154; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5155; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5156; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5157; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5158; GFX12-CU-NEXT: s_wait_dscnt 0x0 5159; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5160; GFX12-CU-NEXT: s_wait_dscnt 0x0 5161; GFX12-CU-NEXT: s_endpgm 5162 ptr %out, i32 %in, i32 %old) { 5163entry: 5164 %gep = getelementptr i32, ptr %out, i32 4 5165 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire 5166 ret void 5167} 5168
; Kernel flat_workgroup_seq_cst_seq_cst_cmpxchg (defined next): performs a volatile cmpxchg
; through a plain `ptr` on the i32 four elements past %out (the expected code below shows this
; as a 16-byte offset), with syncscope("workgroup") and seq_cst as both the success and the
; failure ordering. The cmpxchg result is not used. The per-target expectations that follow are
; produced by utils/update_llc_test_checks.py (see the note on the first line of this file);
; regenerate them with that script instead of editing them by hand.
5169define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( 5170; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5171; GFX7: ; %bb.0: ; %entry 5172; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 5173; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5174; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 5175; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 5176; GFX7-NEXT: s_mov_b64 s[10:11], 16 5177; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) 5178; GFX7-NEXT: s_mov_b32 s4, s8 5179; GFX7-NEXT: s_mov_b32 s5, s9 5180; GFX7-NEXT: s_mov_b32 s9, s10 5181; GFX7-NEXT: s_mov_b32 s8, s11 5182; GFX7-NEXT: s_add_u32 s4, s4, s9 5183; GFX7-NEXT: s_addc_u32 s8, s5, s8 5184; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5185; GFX7-NEXT: s_mov_b32 s5, s8 5186; GFX7-NEXT: v_mov_b32_e32 v2, s7 5187; GFX7-NEXT: v_mov_b32_e32 v0, s6 5188; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5189; GFX7-NEXT: v_mov_b32_e32 v3, v0 5190; GFX7-NEXT: v_mov_b32_e32 v0, s4 5191; GFX7-NEXT: v_mov_b32_e32 v1, s5 5192; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5193; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5194; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5195; GFX7-NEXT: s_endpgm 5196; 5197; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5198; GFX10-WGP: ; %bb.0: ; %entry 5199; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 5200; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5201; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 5202; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 5203; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 5204; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5205; GFX10-WGP-NEXT: s_mov_b32 s4, s8 5206; GFX10-WGP-NEXT: s_mov_b32 s5, s9 5207; GFX10-WGP-NEXT: s_mov_b32 s9, s10 5208; GFX10-WGP-NEXT: s_mov_b32 s8, s11 5209; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 5210; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 5211; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5212; GFX10-WGP-NEXT: s_mov_b32 s5, s8 5213; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 5214; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5215; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5216; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5217; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5218; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5219; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5220; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5221; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5222; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 5223; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5224; GFX10-WGP-NEXT: buffer_gl0_inv 5225; GFX10-WGP-NEXT: s_endpgm 5226; 5227; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5228; GFX10-CU: ; %bb.0: ; %entry 5229; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 5230; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5231; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 5232; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 5233; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 5234; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5235; GFX10-CU-NEXT: s_mov_b32 s4, s8 5236; GFX10-CU-NEXT: s_mov_b32 s5, s9 5237; GFX10-CU-NEXT: s_mov_b32 s9, s10 5238; GFX10-CU-NEXT: s_mov_b32 s8, s11 5239; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 5240; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 5241; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5242; GFX10-CU-NEXT: s_mov_b32 s5, s8 5243; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 5244; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5245; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5246; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5247; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5248; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5249; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5250; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5251; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5252; GFX10-CU-NEXT: s_endpgm 5253; 5254; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5255; SKIP-CACHE-INV: ; %bb.0: ; %entry 5256; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 5257; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 5258; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 5259; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 5260; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 5261; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5262; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 5263; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 5264; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 5265; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 5266; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 
5267; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 5268; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 5269; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 5270; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5271; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5272; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5273; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5274; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5275; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5276; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5277; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5278; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5279; SKIP-CACHE-INV-NEXT: s_endpgm 5280; 5281; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5282; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5283; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5284; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5285; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5286; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5287; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5288; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5289; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5290; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5291; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5292; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5293; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5294; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5295; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5296; 5297; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5298; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5299; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5300; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5301; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5302; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5303; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5304; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s6 5305; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5306; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5307; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5308; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5309; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5310; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5311; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5312; GFX90A-TGSPLIT-NEXT: s_endpgm 5313; 5314; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5315; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5316; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5317; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5318; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5319; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5320; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5321; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5322; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5323; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5324; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5325; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5326; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5327; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5328; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5329; 5330; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5331; GFX940-TGSPLIT: ; %bb.0: ; %entry 5332; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5333; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5334; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5335; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5336; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5337; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5338; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5339; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5340; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5341; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5342; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5343; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5344; GFX940-TGSPLIT-NEXT: buffer_inv sc0 5345; GFX940-TGSPLIT-NEXT: s_endpgm 5346; 5347; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5348; GFX11-WGP: ; %bb.0: ; %entry 5349; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5350; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5351; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5352; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5353; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5354; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5355; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5356; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5357; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5358; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5359; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5360; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5361; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5362; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5363; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5364; GFX11-WGP-NEXT: buffer_gl0_inv 5365; GFX11-WGP-NEXT: s_endpgm 5366; 5367; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5368; GFX11-CU: ; %bb.0: ; %entry 5369; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5370; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5371; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5372; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5373; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5374; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5375; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5376; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5377; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5378; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5379; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5380; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5381; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5382; GFX11-CU-NEXT: s_endpgm 5383; 5384; GFX12-WGP-LABEL: 
flat_workgroup_seq_cst_seq_cst_cmpxchg: 5385; GFX12-WGP: ; %bb.0: ; %entry 5386; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5387; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5388; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5389; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5390; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5391; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5392; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5393; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5394; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5395; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5396; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 5397; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 5398; GFX12-WGP-NEXT: s_wait_storecnt 0x0 5399; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 5400; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 5401; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 5402; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 5403; GFX12-WGP-NEXT: s_endpgm 5404; 5405; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: 5406; GFX12-CU: ; %bb.0: ; %entry 5407; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5408; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5409; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5410; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5411; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5412; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5413; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5414; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5415; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5416; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5417; GFX12-CU-NEXT: s_wait_dscnt 0x0 5418; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5419; GFX12-CU-NEXT: s_wait_dscnt 0x0 5420; GFX12-CU-NEXT: s_endpgm 5421 ptr %out, i32 %in, i32 %old) { 5422entry: 5423 %gep = getelementptr i32, ptr %out, i32 4 5424 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst 5425 ret void 5426} 5427 5428define amdgpu_kernel void 
; Kernel flat_workgroup_monotonic_monotonic_ret_cmpxchg (its name and parameter list follow):
; performs a volatile cmpxchg through a plain `ptr` on the i32 four elements past %out, with
; syncscope("workgroup") and monotonic as both the success and the failure ordering. Unlike the
; non-"ret" kernels, element 0 (the i32 member) of the cmpxchg result is extracted and stored to
; %out, so the expected code uses the value-returning form of the atomic (glc, sc0 or
; th:TH_ATOMIC_RETURN depending on the target) followed by a flat store. The expectations are
; produced by utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
@flat_workgroup_monotonic_monotonic_ret_cmpxchg( 5429; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5430; GFX7: ; %bb.0: ; %entry 5431; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 5432; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5433; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 5434; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 5435; GFX7-NEXT: s_mov_b64 s[12:13], 16 5436; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5437; GFX7-NEXT: s_mov_b32 s6, s4 5438; GFX7-NEXT: s_mov_b32 s7, s5 5439; GFX7-NEXT: s_mov_b32 s11, s12 5440; GFX7-NEXT: s_mov_b32 s10, s13 5441; GFX7-NEXT: s_add_u32 s6, s6, s11 5442; GFX7-NEXT: s_addc_u32 s10, s7, s10 5443; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5444; GFX7-NEXT: s_mov_b32 s7, s10 5445; GFX7-NEXT: v_mov_b32_e32 v2, s9 5446; GFX7-NEXT: v_mov_b32_e32 v0, s8 5447; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5448; GFX7-NEXT: v_mov_b32_e32 v3, v0 5449; GFX7-NEXT: v_mov_b32_e32 v0, s6 5450; GFX7-NEXT: v_mov_b32_e32 v1, s7 5451; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5452; GFX7-NEXT: v_mov_b32_e32 v0, s4 5453; GFX7-NEXT: v_mov_b32_e32 v1, s5 5454; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5455; GFX7-NEXT: flat_store_dword v[0:1], v2 5456; GFX7-NEXT: s_endpgm 5457; 5458; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5459; GFX10-WGP: ; %bb.0: ; %entry 5460; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 5461; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5462; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 5463; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 5464; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 5465; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5466; GFX10-WGP-NEXT: s_mov_b32 s6, s4 5467; GFX10-WGP-NEXT: s_mov_b32 s7, s5 5468; GFX10-WGP-NEXT: s_mov_b32 s11, s12 5469; GFX10-WGP-NEXT: s_mov_b32 s10, s13 5470; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 5471; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 5472; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5473; 
GFX10-WGP-NEXT: s_mov_b32 s7, s10 5474; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 5475; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 5476; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5477; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5478; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5479; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 5480; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5481; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5482; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5483; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5484; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5485; GFX10-WGP-NEXT: s_endpgm 5486; 5487; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5488; GFX10-CU: ; %bb.0: ; %entry 5489; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 5490; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5491; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 5492; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 5493; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 5494; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5495; GFX10-CU-NEXT: s_mov_b32 s6, s4 5496; GFX10-CU-NEXT: s_mov_b32 s7, s5 5497; GFX10-CU-NEXT: s_mov_b32 s11, s12 5498; GFX10-CU-NEXT: s_mov_b32 s10, s13 5499; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 5500; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 5501; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5502; GFX10-CU-NEXT: s_mov_b32 s7, s10 5503; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 5504; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 5505; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5506; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5507; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5508; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 5509; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5510; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5511; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5512; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5513; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5514; GFX10-CU-NEXT: s_endpgm 5515; 5516; SKIP-CACHE-INV-LABEL: 
flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5517; SKIP-CACHE-INV: ; %bb.0: ; %entry 5518; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 5519; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 5520; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 5521; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 5522; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 5523; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5524; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 5525; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 5526; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 5527; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 5528; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 5529; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 5530; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 5531; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 5532; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 5533; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5534; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5535; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5536; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5537; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5538; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5541; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5542; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5543; SKIP-CACHE-INV-NEXT: s_endpgm 5544; 5545; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5546; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5547; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5548; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5549; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5550; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5551; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5552; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5553; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec 5554; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5555; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5556; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5557; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5558; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5559; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5560; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5561; 5562; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5563; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5564; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5565; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5566; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5567; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5568; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5569; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5570; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5571; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5572; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5573; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5574; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5575; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5576; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5577; GFX90A-TGSPLIT-NEXT: s_endpgm 5578; 5579; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5580; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5581; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5582; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5583; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5584; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5585; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5586; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5587; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5588; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 5589; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5590; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5591; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5592; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5593; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 5594; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5595; 5596; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5597; GFX940-TGSPLIT: ; %bb.0: ; %entry 5598; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5599; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5600; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5601; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5602; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5603; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5604; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5605; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5606; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5607; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5608; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5609; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5610; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 5611; GFX940-TGSPLIT-NEXT: s_endpgm 5612; 5613; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5614; GFX11-WGP: ; %bb.0: ; %entry 5615; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5616; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5617; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5618; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5619; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5620; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5621; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5622; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5623; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5624; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5625; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5626; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5627; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5628; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5629; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5630; GFX11-WGP-NEXT: s_endpgm 5631; 5632; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5633; GFX11-CU: ; %bb.0: ; %entry 5634; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5635; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5636; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5637; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5638; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5639; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5640; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5641; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5642; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5643; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5644; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5645; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5646; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5647; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5648; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5649; GFX11-CU-NEXT: s_endpgm 5650; 5651; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5652; GFX12-WGP: ; %bb.0: ; %entry 5653; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5654; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5655; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5656; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5657; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5658; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5659; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5660; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5661; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5662; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5663; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 5664; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5665; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5666; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 5667; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 5668; 
GFX12-WGP-NEXT: s_endpgm 5669; 5670; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: 5671; GFX12-CU: ; %bb.0: ; %entry 5672; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5673; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5674; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5675; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5676; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5677; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5678; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5679; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5680; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5681; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5682; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 5683; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5684; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5685; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 5686; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 5687; GFX12-CU-NEXT: s_endpgm 5688 ptr %out, i32 %in, i32 %old) { 5689entry: 5690 %gep = getelementptr i32, ptr %out, i32 4 5691 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic 5692 %val0 = extractvalue { i32, i1 } %val, 0 5693 store i32 %val0, ptr %out, align 4 5694 ret void 5695} 5696 5697define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( 5698; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5699; GFX7: ; %bb.0: ; %entry 5700; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 5701; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5702; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 5703; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 5704; GFX7-NEXT: s_mov_b64 s[12:13], 16 5705; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5706; GFX7-NEXT: s_mov_b32 s6, s4 5707; GFX7-NEXT: s_mov_b32 s7, s5 5708; GFX7-NEXT: s_mov_b32 s11, s12 5709; GFX7-NEXT: s_mov_b32 s10, s13 5710; GFX7-NEXT: s_add_u32 s6, s6, s11 5711; GFX7-NEXT: s_addc_u32 s10, s7, s10 5712; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5713; GFX7-NEXT: s_mov_b32 s7, s10 5714; GFX7-NEXT: 
v_mov_b32_e32 v2, s9 5715; GFX7-NEXT: v_mov_b32_e32 v0, s8 5716; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5717; GFX7-NEXT: v_mov_b32_e32 v3, v0 5718; GFX7-NEXT: v_mov_b32_e32 v0, s6 5719; GFX7-NEXT: v_mov_b32_e32 v1, s7 5720; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5721; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5722; GFX7-NEXT: v_mov_b32_e32 v0, s4 5723; GFX7-NEXT: v_mov_b32_e32 v1, s5 5724; GFX7-NEXT: s_waitcnt vmcnt(0) 5725; GFX7-NEXT: flat_store_dword v[0:1], v2 5726; GFX7-NEXT: s_endpgm 5727; 5728; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5729; GFX10-WGP: ; %bb.0: ; %entry 5730; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 5731; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5732; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 5733; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 5734; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 5735; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5736; GFX10-WGP-NEXT: s_mov_b32 s6, s4 5737; GFX10-WGP-NEXT: s_mov_b32 s7, s5 5738; GFX10-WGP-NEXT: s_mov_b32 s11, s12 5739; GFX10-WGP-NEXT: s_mov_b32 s10, s13 5740; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 5741; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 5742; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5743; GFX10-WGP-NEXT: s_mov_b32 s7, s10 5744; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 5745; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 5746; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5747; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5748; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5749; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 5750; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5751; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5752; GFX10-WGP-NEXT: buffer_gl0_inv 5753; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5754; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5755; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5756; GFX10-WGP-NEXT: s_endpgm 5757; 5758; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 
5759; GFX10-CU: ; %bb.0: ; %entry 5760; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 5761; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5762; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 5763; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 5764; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 5765; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5766; GFX10-CU-NEXT: s_mov_b32 s6, s4 5767; GFX10-CU-NEXT: s_mov_b32 s7, s5 5768; GFX10-CU-NEXT: s_mov_b32 s11, s12 5769; GFX10-CU-NEXT: s_mov_b32 s10, s13 5770; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 5771; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 5772; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5773; GFX10-CU-NEXT: s_mov_b32 s7, s10 5774; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 5775; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 5776; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5777; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5778; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5779; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 5780; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5781; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5782; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5783; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5784; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5785; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5786; GFX10-CU-NEXT: s_endpgm 5787; 5788; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5789; SKIP-CACHE-INV: ; %bb.0: ; %entry 5790; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 5791; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 5792; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 5793; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 5794; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 5795; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5796; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 5797; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 5798; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 5799; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 5800; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 5801; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 5802; 
SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 5803; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 5804; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 5805; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5806; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5807; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5808; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5809; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5810; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5811; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5812; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5813; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5814; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5815; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5816; SKIP-CACHE-INV-NEXT: s_endpgm 5817; 5818; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5819; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5820; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5821; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5822; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5823; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5824; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5825; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5826; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5827; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5828; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5829; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5830; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5831; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5832; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5833; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5834; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5835; 5836; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5837; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5838; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 5839; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5840; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5841; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5842; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5843; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5844; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5845; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5846; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5847; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5848; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5849; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 5850; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5851; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5852; GFX90A-TGSPLIT-NEXT: s_endpgm 5853; 5854; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5855; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5856; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5857; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5858; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5859; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5860; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5861; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5862; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5863; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5864; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5865; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5866; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5867; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5868; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 5869; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 5870; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5871; 5872; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5873; GFX940-TGSPLIT: ; %bb.0: ; %entry 5874; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5875; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5876; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5877; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5878; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5879; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5880; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5881; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5882; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5883; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5884; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5885; GFX940-TGSPLIT-NEXT: buffer_inv sc0 5886; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5887; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 5888; GFX940-TGSPLIT-NEXT: s_endpgm 5889; 5890; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5891; GFX11-WGP: ; %bb.0: ; %entry 5892; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5893; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5894; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5895; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5896; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5897; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5898; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5899; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5900; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5901; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5902; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5903; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5904; GFX11-WGP-NEXT: buffer_gl0_inv 5905; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5906; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5907; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5908; GFX11-WGP-NEXT: s_endpgm 5909; 5910; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5911; GFX11-CU: ; %bb.0: ; %entry 5912; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5913; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5914; GFX11-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc 5915; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5916; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5917; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5918; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5919; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5920; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5921; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5922; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5923; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5924; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5925; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5926; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 5927; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5928; GFX11-CU-NEXT: s_endpgm 5929; 5930; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5931; GFX12-WGP: ; %bb.0: ; %entry 5932; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5933; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5934; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5935; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5936; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5937; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5938; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5939; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5940; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5941; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5942; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 5943; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 5944; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 5945; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5946; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5947; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 5948; GFX12-WGP-NEXT: s_endpgm 5949; 5950; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: 5951; GFX12-CU: ; %bb.0: ; %entry 5952; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5953; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5954; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5955; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5956; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5957; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5958; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5959; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5960; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5961; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5962; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 5963; GFX12-CU-NEXT: s_wait_dscnt 0x0 5964; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5965; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5966; GFX12-CU-NEXT: s_wait_loadcnt 0x0 5967; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 5968; GFX12-CU-NEXT: s_endpgm 5969 ptr %out, i32 %in, i32 %old) { 5970entry: 5971 %gep = getelementptr i32, ptr %out, i32 4 5972 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic 5973 %val0 = extractvalue { i32, i1 } %val, 0 5974 store i32 %val0, ptr %out, align 4 5975 ret void 5976} 5977 5978define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( 5979; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 5980; GFX7: ; %bb.0: ; %entry 5981; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 5982; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5983; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 5984; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 5985; GFX7-NEXT: s_mov_b64 s[12:13], 16 5986; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5987; GFX7-NEXT: s_mov_b32 s6, s4 5988; GFX7-NEXT: s_mov_b32 s7, s5 5989; GFX7-NEXT: s_mov_b32 s11, s12 5990; GFX7-NEXT: s_mov_b32 s10, s13 5991; GFX7-NEXT: s_add_u32 s6, s6, s11 5992; GFX7-NEXT: s_addc_u32 s10, s7, s10 5993; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5994; GFX7-NEXT: s_mov_b32 s7, s10 5995; GFX7-NEXT: v_mov_b32_e32 v2, s9 5996; GFX7-NEXT: v_mov_b32_e32 v0, s8 5997; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5998; GFX7-NEXT: v_mov_b32_e32 v3, v0 5999; GFX7-NEXT: v_mov_b32_e32 v0, s6 6000; GFX7-NEXT: v_mov_b32_e32 v1, s7 6001; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6002; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc 6003; GFX7-NEXT: v_mov_b32_e32 v0, s4 6004; GFX7-NEXT: v_mov_b32_e32 v1, s5 6005; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6006; GFX7-NEXT: flat_store_dword v[0:1], v2 6007; GFX7-NEXT: s_endpgm 6008; 6009; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6010; GFX10-WGP: ; %bb.0: ; %entry 6011; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 6012; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6013; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 6014; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 6015; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 6016; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6017; GFX10-WGP-NEXT: s_mov_b32 s6, s4 6018; GFX10-WGP-NEXT: s_mov_b32 s7, s5 6019; GFX10-WGP-NEXT: s_mov_b32 s11, s12 6020; GFX10-WGP-NEXT: s_mov_b32 s10, s13 6021; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 6022; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 6023; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6024; GFX10-WGP-NEXT: s_mov_b32 s7, s10 6025; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 6026; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 6027; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6028; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 6029; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 6030; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 6031; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6032; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6033; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6034; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6035; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6036; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6037; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6038; GFX10-WGP-NEXT: s_endpgm 6039; 6040; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6041; GFX10-CU: ; %bb.0: ; %entry 6042; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 6043; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6044; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 6045; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 6046; GFX10-CU-NEXT: s_mov_b64 
s[12:13], 16 6047; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6048; GFX10-CU-NEXT: s_mov_b32 s6, s4 6049; GFX10-CU-NEXT: s_mov_b32 s7, s5 6050; GFX10-CU-NEXT: s_mov_b32 s11, s12 6051; GFX10-CU-NEXT: s_mov_b32 s10, s13 6052; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 6053; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 6054; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6055; GFX10-CU-NEXT: s_mov_b32 s7, s10 6056; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 6057; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 6058; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6059; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 6060; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 6061; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 6062; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6063; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6064; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6065; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6066; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6067; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6068; GFX10-CU-NEXT: s_endpgm 6069; 6070; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6071; SKIP-CACHE-INV: ; %bb.0: ; %entry 6072; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 6073; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 6074; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 6075; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 6076; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 6077; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6078; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 6079; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 6080; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 6081; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 6082; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 6083; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 6084; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 6085; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 6086; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 6087; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6088; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 6090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6091; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6092; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6093; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6094; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6095; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6096; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6097; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6098; SKIP-CACHE-INV-NEXT: s_endpgm 6099; 6100; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6101; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6102; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6103; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6104; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6105; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6106; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6107; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6108; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6109; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6110; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6111; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6112; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6113; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6114; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6115; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6116; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6117; 6118; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6119; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6120; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6121; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6122; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6123; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6124; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 
s7 6125; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6126; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6127; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6128; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6129; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6130; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6131; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6132; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6133; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6134; GFX90A-TGSPLIT-NEXT: s_endpgm 6135; 6136; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6137; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6138; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6139; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6140; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6141; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6142; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6143; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6144; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6145; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6146; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6147; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6148; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6149; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6150; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6151; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6152; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6153; 6154; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6155; GFX940-TGSPLIT: ; %bb.0: ; %entry 6156; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6157; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6158; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6159; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6160; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6161; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6162; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6163; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6164; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6165; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6166; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6167; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6168; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6169; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6170; GFX940-TGSPLIT-NEXT: s_endpgm 6171; 6172; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6173; GFX11-WGP: ; %bb.0: ; %entry 6174; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6175; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6176; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6177; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6178; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 6179; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 6180; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6181; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 6182; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6183; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6184; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6185; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6186; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6187; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6188; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6189; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6190; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6191; GFX11-WGP-NEXT: s_endpgm 6192; 6193; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6194; GFX11-CU: ; %bb.0: ; %entry 6195; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6196; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6197; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6198; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6199; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 6200; GFX11-CU-NEXT: v_mov_b32_e32 
v0, s2 6201; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6202; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 6203; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6204; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6205; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6206; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6207; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6208; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6209; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6210; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6211; GFX11-CU-NEXT: s_endpgm 6212; 6213; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6214; GFX12-WGP: ; %bb.0: ; %entry 6215; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6216; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6217; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6218; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 6219; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 6220; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 6221; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6222; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 6223; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6224; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6225; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 6226; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 6227; GFX12-WGP-NEXT: s_wait_storecnt 0x0 6228; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6229; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 6230; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6231; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6232; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6233; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 6234; GFX12-WGP-NEXT: s_endpgm 6235; 6236; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: 6237; GFX12-CU: ; %bb.0: ; %entry 6238; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6239; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6240; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6241; GFX12-CU-NEXT: s_wait_kmcnt 0x0 6242; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 6243; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s2 6244; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6245; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 6246; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6247; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6248; GFX12-CU-NEXT: s_wait_dscnt 0x0 6249; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 6250; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6251; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6252; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 6253; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 6254; GFX12-CU-NEXT: s_endpgm 6255 ptr %out, i32 %in, i32 %old) { 6256entry: 6257 %gep = getelementptr i32, ptr %out, i32 4 6258 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic 6259 %val0 = extractvalue { i32, i1 } %val, 0 6260 store i32 %val0, ptr %out, align 4 6261 ret void 6262} 6263 6264define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( 6265; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6266; GFX7: ; %bb.0: ; %entry 6267; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 6268; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6269; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 6270; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 6271; GFX7-NEXT: s_mov_b64 s[12:13], 16 6272; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6273; GFX7-NEXT: s_mov_b32 s6, s4 6274; GFX7-NEXT: s_mov_b32 s7, s5 6275; GFX7-NEXT: s_mov_b32 s11, s12 6276; GFX7-NEXT: s_mov_b32 s10, s13 6277; GFX7-NEXT: s_add_u32 s6, s6, s11 6278; GFX7-NEXT: s_addc_u32 s10, s7, s10 6279; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6280; GFX7-NEXT: s_mov_b32 s7, s10 6281; GFX7-NEXT: v_mov_b32_e32 v2, s9 6282; GFX7-NEXT: v_mov_b32_e32 v0, s8 6283; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6284; GFX7-NEXT: v_mov_b32_e32 v3, v0 6285; GFX7-NEXT: v_mov_b32_e32 v0, s6 6286; GFX7-NEXT: v_mov_b32_e32 v1, s7 6287; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6288; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 
6289; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6290; GFX7-NEXT: v_mov_b32_e32 v0, s4 6291; GFX7-NEXT: v_mov_b32_e32 v1, s5 6292; GFX7-NEXT: s_waitcnt vmcnt(0) 6293; GFX7-NEXT: flat_store_dword v[0:1], v2 6294; GFX7-NEXT: s_endpgm 6295; 6296; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6297; GFX10-WGP: ; %bb.0: ; %entry 6298; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 6299; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6300; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 6301; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 6302; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 6303; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6304; GFX10-WGP-NEXT: s_mov_b32 s6, s4 6305; GFX10-WGP-NEXT: s_mov_b32 s7, s5 6306; GFX10-WGP-NEXT: s_mov_b32 s11, s12 6307; GFX10-WGP-NEXT: s_mov_b32 s10, s13 6308; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 6309; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 6310; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6311; GFX10-WGP-NEXT: s_mov_b32 s7, s10 6312; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 6313; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 6314; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6315; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 6316; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 6317; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 6318; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6319; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6320; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6321; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6322; GFX10-WGP-NEXT: buffer_gl0_inv 6323; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6324; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6325; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6326; GFX10-WGP-NEXT: s_endpgm 6327; 6328; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6329; GFX10-CU: ; %bb.0: ; %entry 6330; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 6331; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6332; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 6333; GFX10-CU-NEXT: 
s_load_dword s8, s[6:7], 0xc 6334; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 6335; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6336; GFX10-CU-NEXT: s_mov_b32 s6, s4 6337; GFX10-CU-NEXT: s_mov_b32 s7, s5 6338; GFX10-CU-NEXT: s_mov_b32 s11, s12 6339; GFX10-CU-NEXT: s_mov_b32 s10, s13 6340; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 6341; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 6342; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6343; GFX10-CU-NEXT: s_mov_b32 s7, s10 6344; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 6345; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 6346; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6347; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 6348; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 6349; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 6350; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6351; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6352; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6353; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6354; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6355; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 6356; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6357; GFX10-CU-NEXT: s_endpgm 6358; 6359; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6360; SKIP-CACHE-INV: ; %bb.0: ; %entry 6361; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 6362; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 6363; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 6364; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 6365; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 6366; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6367; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 6368; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 6369; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 6370; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 6371; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 6372; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 6373; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 6374; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 6375; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 
6376; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6377; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6378; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 6379; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6380; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6381; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6382; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6383; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6384; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6385; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6386; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 6387; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6388; SKIP-CACHE-INV-NEXT: s_endpgm 6389; 6390; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6391; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6392; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6393; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6394; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6395; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6396; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6397; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6398; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6399; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6400; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6401; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6402; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6403; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6404; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6405; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6406; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6407; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6408; 6409; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6410; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6411; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6412; GFX90A-TGSPLIT-NEXT: s_load_dword 
s7, s[8:9], 0x8 6413; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6414; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6415; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6416; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6417; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6418; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6419; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6420; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6421; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6422; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6423; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 6424; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6425; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6426; GFX90A-TGSPLIT-NEXT: s_endpgm 6427; 6428; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6429; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6430; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6431; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6432; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6433; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6434; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6435; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6436; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6437; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6438; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6439; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6440; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6441; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6442; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6443; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 6444; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6445; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6446; 6447; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6448; GFX940-TGSPLIT: ; %bb.0: 
; %entry 6449; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6450; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6451; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6452; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6453; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6454; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6455; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6456; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6457; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6458; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6459; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6460; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6461; GFX940-TGSPLIT-NEXT: buffer_inv sc0 6462; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6463; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6464; GFX940-TGSPLIT-NEXT: s_endpgm 6465; 6466; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6467; GFX11-WGP: ; %bb.0: ; %entry 6468; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6469; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6470; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6471; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6472; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 6473; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 6474; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6475; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 6476; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6477; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6478; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6479; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 6480; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6481; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6482; GFX11-WGP-NEXT: buffer_gl0_inv 6483; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6484; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6485; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6486; GFX11-WGP-NEXT: s_endpgm 6487; 6488; GFX11-CU-LABEL: 
flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6489; GFX11-CU: ; %bb.0: ; %entry 6490; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6491; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6492; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6493; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6494; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 6495; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 6496; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6497; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 6498; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6499; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6500; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6501; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6502; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6503; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6504; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6505; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 6506; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6507; GFX11-CU-NEXT: s_endpgm 6508; 6509; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: 6510; GFX12-WGP: ; %bb.0: ; %entry 6511; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6512; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6513; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6514; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 6515; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 6516; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 6517; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6518; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 6519; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6520; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6521; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 6522; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 6523; GFX12-WGP-NEXT: s_wait_storecnt 0x0 6524; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6525; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 6526; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 6527; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 6528; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6529; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 6530; 
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Test case - returning cmpxchg, workgroup scope, seq_cst / monotonic.
;
; The kernel performs a volatile cmpxchg through a plain `ptr` (no addrspace
; qualifier) at syncscope("workgroup"), with success ordering seq_cst and
; failure ordering monotonic. The value produced by the cmpxchg is used: it
; is stored back through %out, so every configuration below is expected to
; emit the returning form of flat_atomic_cmpswap followed by a flat store.
;
; The assertion lines between the `define` line and the parameter list are
; autogenerated (see the NOTE on the first line of this file). Regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Address of i32 element 4 of %out, i.e. byte offset 16 (the expected
  ; output above shows this as "offset:16" or as an explicit add of 16).
  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the memory word against %old and, on a match, replace it with %in.
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this test.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the cmpxchg return value live.
  store i32 %val0, ptr %out, align 4
  ret void
}

define amdgpu_kernel void
@flat_workgroup_monotonic_acquire_ret_cmpxchg(
; Test case - returning cmpxchg, workgroup scope, monotonic / acquire.
;
; The kernel performs a volatile cmpxchg through a plain `ptr` (no addrspace
; qualifier) at syncscope("workgroup"), with success ordering monotonic and
; failure ordering acquire. The value produced by the cmpxchg is used: it is
; stored back through %out. In the expected output below no wait instruction
; is placed directly before the atomic; waits (and, for some configurations,
; a cache invalidate) appear only after it.
;
; The assertion lines that follow are autogenerated (see the NOTE on the
; first line of this file). Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Address of i32 element 4 of %out, i.e. byte offset 16 (the expected
  ; output above shows this as "offset:16" or as an explicit add of 16).
  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the memory word against %old and, on a match, replace it with %in.
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this test.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the cmpxchg return value live.
  store i32 %val0, ptr %out, align 4
  ret void
}

define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9,
s[6:7], 0x2 7153; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 7154; GFX7-NEXT: s_mov_b64 s[12:13], 16 7155; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7156; GFX7-NEXT: s_mov_b32 s6, s4 7157; GFX7-NEXT: s_mov_b32 s7, s5 7158; GFX7-NEXT: s_mov_b32 s11, s12 7159; GFX7-NEXT: s_mov_b32 s10, s13 7160; GFX7-NEXT: s_add_u32 s6, s6, s11 7161; GFX7-NEXT: s_addc_u32 s10, s7, s10 7162; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7163; GFX7-NEXT: s_mov_b32 s7, s10 7164; GFX7-NEXT: v_mov_b32_e32 v2, s9 7165; GFX7-NEXT: v_mov_b32_e32 v0, s8 7166; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7167; GFX7-NEXT: v_mov_b32_e32 v3, v0 7168; GFX7-NEXT: v_mov_b32_e32 v0, s6 7169; GFX7-NEXT: v_mov_b32_e32 v1, s7 7170; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7171; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7172; GFX7-NEXT: v_mov_b32_e32 v0, s4 7173; GFX7-NEXT: v_mov_b32_e32 v1, s5 7174; GFX7-NEXT: s_waitcnt vmcnt(0) 7175; GFX7-NEXT: flat_store_dword v[0:1], v2 7176; GFX7-NEXT: s_endpgm 7177; 7178; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7179; GFX10-WGP: ; %bb.0: ; %entry 7180; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7181; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7182; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7183; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7184; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7185; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7186; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7187; GFX10-WGP-NEXT: s_mov_b32 s7, s5 7188; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7189; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7190; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7191; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7192; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7193; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7194; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 7195; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7196; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7197; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7198; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7199; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7200; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7201; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7202; GFX10-WGP-NEXT: buffer_gl0_inv 7203; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7204; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7205; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7206; GFX10-WGP-NEXT: s_endpgm 7207; 7208; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7209; GFX10-CU: ; %bb.0: ; %entry 7210; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7211; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7212; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7213; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7214; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7215; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7216; GFX10-CU-NEXT: s_mov_b32 s6, s4 7217; GFX10-CU-NEXT: s_mov_b32 s7, s5 7218; GFX10-CU-NEXT: s_mov_b32 s11, s12 7219; GFX10-CU-NEXT: s_mov_b32 s10, s13 7220; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7221; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7222; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7223; GFX10-CU-NEXT: s_mov_b32 s7, s10 7224; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7225; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 7226; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7227; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 7228; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 7229; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 7230; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7231; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7232; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7233; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7234; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7235; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7236; GFX10-CU-NEXT: s_endpgm 7237; 7238; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7239; SKIP-CACHE-INV: ; %bb.0: ; %entry 7240; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 7241; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 7242; 
SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 7243; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 7244; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 7245; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7246; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 7247; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 7248; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 7249; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 7250; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 7251; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 7252; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 7253; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 7254; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 7255; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7256; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7257; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 7258; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7259; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7260; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7261; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7262; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 7263; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 7264; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7265; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7266; SKIP-CACHE-INV-NEXT: s_endpgm 7267; 7268; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7269; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7270; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7271; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7272; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7273; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7274; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7275; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7276; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7277; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7278; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7279; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7280; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7281; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7282; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7283; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7284; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7285; 7286; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7287; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7288; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7289; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7290; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7291; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7292; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7293; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7294; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7295; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7296; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7297; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7298; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7299; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7300; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7301; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7302; GFX90A-TGSPLIT-NEXT: s_endpgm 7303; 7304; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7305; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7306; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7307; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7308; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7309; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7310; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7311; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7312; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7313; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7314; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7315; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7316; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7317; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7318; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7319; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7320; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7321; 7322; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7323; GFX940-TGSPLIT: ; %bb.0: ; %entry 7324; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7325; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7326; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7327; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7328; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7329; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7330; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7331; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7332; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7333; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7334; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7335; GFX940-TGSPLIT-NEXT: buffer_inv sc0 7336; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7337; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7338; GFX940-TGSPLIT-NEXT: s_endpgm 7339; 7340; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7341; GFX11-WGP: ; %bb.0: ; %entry 7342; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7343; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7344; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7345; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7346; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 7347; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 7348; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7349; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 7350; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7351; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7352; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7353; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 7354; GFX11-WGP-NEXT: buffer_gl0_inv 7355; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7356; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7357; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7358; GFX11-WGP-NEXT: s_endpgm 7359; 7360; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7361; GFX11-CU: ; %bb.0: ; %entry 7362; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7363; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7364; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7365; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7366; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 7367; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 7368; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7369; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 7370; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7371; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7372; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7373; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7374; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7375; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7376; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 7377; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7378; GFX11-CU-NEXT: s_endpgm 7379; 7380; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7381; GFX12-WGP: ; %bb.0: ; %entry 7382; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7383; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7384; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7385; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 7386; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 7387; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 7388; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7389; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 7390; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7391; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7392; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 7393; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7394; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 7395; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7396; 
GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7397; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 7398; GFX12-WGP-NEXT: s_endpgm 7399; 7400; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: 7401; GFX12-CU: ; %bb.0: ; %entry 7402; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7403; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7404; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7405; GFX12-CU-NEXT: s_wait_kmcnt 0x0 7406; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 7407; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 7408; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7409; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 7410; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7411; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7412; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7413; GFX12-CU-NEXT: s_wait_dscnt 0x0 7414; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7415; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7416; GFX12-CU-NEXT: s_wait_loadcnt 0x0 7417; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 7418; GFX12-CU-NEXT: s_endpgm 7419 ptr %out, i32 %in, i32 %old) { 7420entry: 7421 %gep = getelementptr i32, ptr %out, i32 4 7422 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire 7423 %val0 = extractvalue { i32, i1 } %val, 0 7424 store i32 %val0, ptr %out, align 4 7425 ret void 7426} 7427 7428define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( 7429; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7430; GFX7: ; %bb.0: ; %entry 7431; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 7432; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7433; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 7434; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 7435; GFX7-NEXT: s_mov_b64 s[12:13], 16 7436; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7437; GFX7-NEXT: s_mov_b32 s6, s4 7438; GFX7-NEXT: s_mov_b32 s7, s5 7439; GFX7-NEXT: s_mov_b32 s11, s12 7440; GFX7-NEXT: s_mov_b32 s10, s13 7441; GFX7-NEXT: s_add_u32 s6, s6, s11 7442; GFX7-NEXT: s_addc_u32 s10, s7, s10 7443; 
GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7444; GFX7-NEXT: s_mov_b32 s7, s10 7445; GFX7-NEXT: v_mov_b32_e32 v2, s9 7446; GFX7-NEXT: v_mov_b32_e32 v0, s8 7447; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7448; GFX7-NEXT: v_mov_b32_e32 v3, v0 7449; GFX7-NEXT: v_mov_b32_e32 v0, s6 7450; GFX7-NEXT: v_mov_b32_e32 v1, s7 7451; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7452; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7453; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7454; GFX7-NEXT: v_mov_b32_e32 v0, s4 7455; GFX7-NEXT: v_mov_b32_e32 v1, s5 7456; GFX7-NEXT: s_waitcnt vmcnt(0) 7457; GFX7-NEXT: flat_store_dword v[0:1], v2 7458; GFX7-NEXT: s_endpgm 7459; 7460; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7461; GFX10-WGP: ; %bb.0: ; %entry 7462; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7463; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7464; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7465; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7466; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7467; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7468; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7469; GFX10-WGP-NEXT: s_mov_b32 s7, s5 7470; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7471; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7472; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7473; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7474; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7475; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7476; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 7477; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7478; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7479; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7480; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7481; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7482; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7483; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7484; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7485; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7486; GFX10-WGP-NEXT: 
buffer_gl0_inv 7487; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7488; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7489; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7490; GFX10-WGP-NEXT: s_endpgm 7491; 7492; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7493; GFX10-CU: ; %bb.0: ; %entry 7494; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7495; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7496; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7497; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7498; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7499; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7500; GFX10-CU-NEXT: s_mov_b32 s6, s4 7501; GFX10-CU-NEXT: s_mov_b32 s7, s5 7502; GFX10-CU-NEXT: s_mov_b32 s11, s12 7503; GFX10-CU-NEXT: s_mov_b32 s10, s13 7504; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7505; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7506; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7507; GFX10-CU-NEXT: s_mov_b32 s7, s10 7508; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7509; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 7510; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7511; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 7512; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 7513; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 7514; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7515; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7516; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7517; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7518; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7519; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7520; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7521; GFX10-CU-NEXT: s_endpgm 7522; 7523; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7524; SKIP-CACHE-INV: ; %bb.0: ; %entry 7525; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 7526; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 7527; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 7528; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 7529; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 7530; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) 7531; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 7532; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 7533; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 7534; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 7535; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 7536; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 7537; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 7538; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 7539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 7540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7541; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7542; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 7543; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7544; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7545; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7546; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7547; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7548; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 7549; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 7550; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7551; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7552; SKIP-CACHE-INV-NEXT: s_endpgm 7553; 7554; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7555; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7556; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7557; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7558; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7559; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7560; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7561; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7562; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7563; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7564; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7565; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7566; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7567; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 7568; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7569; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7570; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7571; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7572; 7573; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7574; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7575; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7576; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7577; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7578; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7579; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7580; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7581; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7582; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7583; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7584; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7585; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7586; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7587; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7588; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7589; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7590; GFX90A-TGSPLIT-NEXT: s_endpgm 7591; 7592; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7593; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7594; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7595; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7596; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7597; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7598; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7599; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7600; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7601; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7602; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7603; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 7604; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7605; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7606; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7607; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7608; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7609; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7610; 7611; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7612; GFX940-TGSPLIT: ; %bb.0: ; %entry 7613; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7614; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7615; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7616; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7617; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7618; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7619; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7620; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7621; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7622; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7623; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7624; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7625; GFX940-TGSPLIT-NEXT: buffer_inv sc0 7626; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7627; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7628; GFX940-TGSPLIT-NEXT: s_endpgm 7629; 7630; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7631; GFX11-WGP: ; %bb.0: ; %entry 7632; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7633; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7634; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7635; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7636; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 7637; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 7638; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7639; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 7640; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7641; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7642; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 7643; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7644; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7645; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7646; GFX11-WGP-NEXT: buffer_gl0_inv 7647; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7648; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7649; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7650; GFX11-WGP-NEXT: s_endpgm 7651; 7652; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7653; GFX11-CU: ; %bb.0: ; %entry 7654; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7655; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7656; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7657; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7658; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 7659; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 7660; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7661; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 7662; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7663; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7664; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7665; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7666; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7667; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7668; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7669; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 7670; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7671; GFX11-CU-NEXT: s_endpgm 7672; 7673; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7674; GFX12-WGP: ; %bb.0: ; %entry 7675; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7676; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7677; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7678; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 7679; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 7680; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 7681; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7682; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 7683; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7684; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7685; GFX12-WGP-NEXT: s_wait_bvhcnt 
0x0 7686; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 7687; GFX12-WGP-NEXT: s_wait_storecnt 0x0 7688; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7689; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 7690; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 7691; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 7692; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7693; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 7694; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7695; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7696; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 7697; GFX12-WGP-NEXT: s_endpgm 7698; 7699; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: 7700; GFX12-CU: ; %bb.0: ; %entry 7701; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7702; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7703; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7704; GFX12-CU-NEXT: s_wait_kmcnt 0x0 7705; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 7706; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 7707; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7708; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 7709; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7710; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7711; GFX12-CU-NEXT: s_wait_dscnt 0x0 7712; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7713; GFX12-CU-NEXT: s_wait_dscnt 0x0 7714; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7715; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7716; GFX12-CU-NEXT: s_wait_loadcnt 0x0 7717; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 7718; GFX12-CU-NEXT: s_endpgm 7719 ptr %out, i32 %in, i32 %old) { 7720entry: 7721 %gep = getelementptr i32, ptr %out, i32 4 7722 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire 7723 %val0 = extractvalue { i32, i1 } %val, 0 7724 store i32 %val0, ptr %out, align 4 7725 ret void 7726} 7727 7728define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( 7729; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7730; GFX7: ; 
%bb.0: ; %entry 7731; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 7732; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7733; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 7734; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 7735; GFX7-NEXT: s_mov_b64 s[12:13], 16 7736; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7737; GFX7-NEXT: s_mov_b32 s6, s4 7738; GFX7-NEXT: s_mov_b32 s7, s5 7739; GFX7-NEXT: s_mov_b32 s11, s12 7740; GFX7-NEXT: s_mov_b32 s10, s13 7741; GFX7-NEXT: s_add_u32 s6, s6, s11 7742; GFX7-NEXT: s_addc_u32 s10, s7, s10 7743; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7744; GFX7-NEXT: s_mov_b32 s7, s10 7745; GFX7-NEXT: v_mov_b32_e32 v2, s9 7746; GFX7-NEXT: v_mov_b32_e32 v0, s8 7747; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7748; GFX7-NEXT: v_mov_b32_e32 v3, v0 7749; GFX7-NEXT: v_mov_b32_e32 v0, s6 7750; GFX7-NEXT: v_mov_b32_e32 v1, s7 7751; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7752; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7753; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7754; GFX7-NEXT: v_mov_b32_e32 v0, s4 7755; GFX7-NEXT: v_mov_b32_e32 v1, s5 7756; GFX7-NEXT: s_waitcnt vmcnt(0) 7757; GFX7-NEXT: flat_store_dword v[0:1], v2 7758; GFX7-NEXT: s_endpgm 7759; 7760; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7761; GFX10-WGP: ; %bb.0: ; %entry 7762; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7763; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7764; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7765; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7766; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7767; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7768; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7769; GFX10-WGP-NEXT: s_mov_b32 s7, s5 7770; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7771; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7772; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7773; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7774; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7775; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7776; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 
7777; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7778; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7779; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7780; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7781; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7782; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7783; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7784; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7785; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7786; GFX10-WGP-NEXT: buffer_gl0_inv 7787; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7788; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7789; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7790; GFX10-WGP-NEXT: s_endpgm 7791; 7792; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7793; GFX10-CU: ; %bb.0: ; %entry 7794; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7795; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7796; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7797; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7798; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7799; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7800; GFX10-CU-NEXT: s_mov_b32 s6, s4 7801; GFX10-CU-NEXT: s_mov_b32 s7, s5 7802; GFX10-CU-NEXT: s_mov_b32 s11, s12 7803; GFX10-CU-NEXT: s_mov_b32 s10, s13 7804; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7805; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7806; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7807; GFX10-CU-NEXT: s_mov_b32 s7, s10 7808; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7809; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 7810; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7811; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 7812; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 7813; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 7814; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7815; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7816; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7817; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7818; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7819; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 7820; 
GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7821; GFX10-CU-NEXT: s_endpgm 7822; 7823; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7824; SKIP-CACHE-INV: ; %bb.0: ; %entry 7825; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 7826; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 7827; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 7828; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 7829; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 7830; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7831; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 7832; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 7833; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 7834; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 7835; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 7836; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 7837; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 7838; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 7839; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 7840; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7841; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7842; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 7843; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7844; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7845; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7846; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7847; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7848; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 7849; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 7850; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 7851; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7852; SKIP-CACHE-INV-NEXT: s_endpgm 7853; 7854; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7855; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7856; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7857; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7858; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7859; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7860; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7861; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7862; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7863; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7864; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7865; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7866; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7867; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7868; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7869; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7870; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7871; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7872; 7873; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7874; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7875; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7876; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7877; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7878; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7879; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7880; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7881; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7882; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7883; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7884; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7885; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7886; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7887; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 7888; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7889; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7890; GFX90A-TGSPLIT-NEXT: s_endpgm 7891; 7892; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7893; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7894; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7895; GFX940-NOTTGSPLIT-NEXT: 
s_load_dword s3, s[4:5], 0x8 7896; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7897; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7898; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7899; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7900; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7901; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7902; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7903; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7904; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7905; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7906; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7907; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 7908; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7909; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7910; 7911; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7912; GFX940-TGSPLIT: ; %bb.0: ; %entry 7913; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7914; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7915; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7916; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7917; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7918; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7919; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7920; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7921; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7922; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7923; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7924; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7925; GFX940-TGSPLIT-NEXT: buffer_inv sc0 7926; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7927; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7928; GFX940-TGSPLIT-NEXT: s_endpgm 7929; 7930; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7931; GFX11-WGP: ; %bb.0: ; %entry 7932; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 7933; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7934; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7935; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7936; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 7937; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 7938; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7939; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 7940; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7941; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7942; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7943; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 7944; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7945; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7946; GFX11-WGP-NEXT: buffer_gl0_inv 7947; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7948; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7949; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7950; GFX11-WGP-NEXT: s_endpgm 7951; 7952; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7953; GFX11-CU: ; %bb.0: ; %entry 7954; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7955; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7956; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7957; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7958; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 7959; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 7960; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7961; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 7962; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7963; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7964; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7965; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7966; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7967; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7968; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7969; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 7970; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7971; GFX11-CU-NEXT: s_endpgm 7972; 7973; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 7974; GFX12-WGP: ; %bb.0: ; %entry 7975; GFX12-WGP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 7976; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7977; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7978; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 7979; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 7980; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 7981; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7982; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 7983; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7984; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7985; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 7986; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 7987; GFX12-WGP-NEXT: s_wait_storecnt 0x0 7988; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7989; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 7990; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 7991; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 7992; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7993; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 7994; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7995; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7996; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 7997; GFX12-WGP-NEXT: s_endpgm 7998; 7999; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: 8000; GFX12-CU: ; %bb.0: ; %entry 8001; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8002; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8003; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8004; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8005; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8006; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8007; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8008; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8009; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8010; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8011; GFX12-CU-NEXT: s_wait_dscnt 0x0 8012; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8013; GFX12-CU-NEXT: s_wait_dscnt 0x0 8014; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8015; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8016; GFX12-CU-NEXT: s_wait_loadcnt 0x0 8017; GFX12-CU-NEXT: flat_store_b32 v[0:1], 
v2 8018; GFX12-CU-NEXT: s_endpgm 8019 ptr %out, i32 %in, i32 %old) { 8020entry: 8021 %gep = getelementptr i32, ptr %out, i32 4 8022 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire 8023 %val0 = extractvalue { i32, i1 } %val, 0 8024 store i32 %val0, ptr %out, align 4 8025 ret void 8026} 8027 8028define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( 8029; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8030; GFX7: ; %bb.0: ; %entry 8031; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 8032; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8033; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8034; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8035; GFX7-NEXT: s_mov_b64 s[12:13], 16 8036; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8037; GFX7-NEXT: s_mov_b32 s6, s4 8038; GFX7-NEXT: s_mov_b32 s7, s5 8039; GFX7-NEXT: s_mov_b32 s11, s12 8040; GFX7-NEXT: s_mov_b32 s10, s13 8041; GFX7-NEXT: s_add_u32 s6, s6, s11 8042; GFX7-NEXT: s_addc_u32 s10, s7, s10 8043; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8044; GFX7-NEXT: s_mov_b32 s7, s10 8045; GFX7-NEXT: v_mov_b32_e32 v2, s9 8046; GFX7-NEXT: v_mov_b32_e32 v0, s8 8047; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8048; GFX7-NEXT: v_mov_b32_e32 v3, v0 8049; GFX7-NEXT: v_mov_b32_e32 v0, s6 8050; GFX7-NEXT: v_mov_b32_e32 v1, s7 8051; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8052; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8053; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8054; GFX7-NEXT: v_mov_b32_e32 v0, s4 8055; GFX7-NEXT: v_mov_b32_e32 v1, s5 8056; GFX7-NEXT: s_waitcnt vmcnt(0) 8057; GFX7-NEXT: flat_store_dword v[0:1], v2 8058; GFX7-NEXT: s_endpgm 8059; 8060; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8061; GFX10-WGP: ; %bb.0: ; %entry 8062; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8063; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8064; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8065; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8066; 
GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8067; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8068; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8069; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8070; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8071; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8072; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8073; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8074; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8075; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8076; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 8077; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8078; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8079; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8080; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8081; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8082; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8083; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8084; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8085; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8086; GFX10-WGP-NEXT: buffer_gl0_inv 8087; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8088; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8089; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8090; GFX10-WGP-NEXT: s_endpgm 8091; 8092; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8093; GFX10-CU: ; %bb.0: ; %entry 8094; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8095; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8096; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 8097; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8098; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 8099; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8100; GFX10-CU-NEXT: s_mov_b32 s6, s4 8101; GFX10-CU-NEXT: s_mov_b32 s7, s5 8102; GFX10-CU-NEXT: s_mov_b32 s11, s12 8103; GFX10-CU-NEXT: s_mov_b32 s10, s13 8104; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 8105; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 8106; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8107; GFX10-CU-NEXT: s_mov_b32 s7, s10 8108; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 8109; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 
8110; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8111; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8112; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8113; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8114; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8115; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8116; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8117; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8118; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8119; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8120; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8121; GFX10-CU-NEXT: s_endpgm 8122; 8123; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8124; SKIP-CACHE-INV: ; %bb.0: ; %entry 8125; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8126; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8127; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8128; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8129; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8130; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8131; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8132; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8133; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8134; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8135; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8136; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8137; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8138; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8139; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8140; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8141; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8143; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8144; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8145; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8146; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8147; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8148; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8149; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8150; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8151; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8152; SKIP-CACHE-INV-NEXT: s_endpgm 8153; 8154; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8155; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8156; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8157; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8158; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8159; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8160; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8161; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8162; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8163; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8164; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8165; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8166; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8167; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8168; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8169; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8170; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8171; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8172; 8173; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8174; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8175; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8176; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8177; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8178; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8179; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8180; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8181; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8182; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8183; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8184; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8185; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 glc 8186; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8187; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8188; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8189; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8190; GFX90A-TGSPLIT-NEXT: s_endpgm 8191; 8192; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8193; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8194; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8195; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8196; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8197; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8198; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8199; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8200; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8201; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8202; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8203; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8204; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8205; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8206; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8207; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8208; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8209; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8210; 8211; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8212; GFX940-TGSPLIT: ; %bb.0: ; %entry 8213; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8214; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8215; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8216; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8217; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8218; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8219; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8220; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8221; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8222; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 8223; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8224; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8225; GFX940-TGSPLIT-NEXT: buffer_inv sc0 8226; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8227; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8228; GFX940-TGSPLIT-NEXT: s_endpgm 8229; 8230; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8231; GFX11-WGP: ; %bb.0: ; %entry 8232; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8233; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8234; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8235; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8236; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8237; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8238; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8239; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8240; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8241; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8242; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8243; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8244; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8245; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8246; GFX11-WGP-NEXT: buffer_gl0_inv 8247; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8248; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8249; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8250; GFX11-WGP-NEXT: s_endpgm 8251; 8252; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8253; GFX11-CU: ; %bb.0: ; %entry 8254; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8255; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8256; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8257; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8258; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8259; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8260; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8261; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8262; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8263; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8264; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) 8265; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8266; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8267; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8268; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8269; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 8270; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8271; GFX11-CU-NEXT: s_endpgm 8272; 8273; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8274; GFX12-WGP: ; %bb.0: ; %entry 8275; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8276; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8277; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8278; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8279; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8280; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8281; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8282; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8283; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8284; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8285; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 8286; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 8287; GFX12-WGP-NEXT: s_wait_storecnt 0x0 8288; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8289; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 8290; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 8291; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 8292; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8293; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 8294; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8295; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8296; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8297; GFX12-WGP-NEXT: s_endpgm 8298; 8299; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: 8300; GFX12-CU: ; %bb.0: ; %entry 8301; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8302; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8303; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8304; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8305; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8306; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8307; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 8308; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8309; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8310; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8311; GFX12-CU-NEXT: s_wait_dscnt 0x0 8312; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8313; GFX12-CU-NEXT: s_wait_dscnt 0x0 8314; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8315; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8316; GFX12-CU-NEXT: s_wait_loadcnt 0x0 8317; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 8318; GFX12-CU-NEXT: s_endpgm 8319 ptr %out, i32 %in, i32 %old) { 8320entry: 8321 %gep = getelementptr i32, ptr %out, i32 4 8322 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire 8323 %val0 = extractvalue { i32, i1 } %val, 0 8324 store i32 %val0, ptr %out, align 4 8325 ret void 8326} 8327 8328define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( 8329; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8330; GFX7: ; %bb.0: ; %entry 8331; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 8332; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8333; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8334; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8335; GFX7-NEXT: s_mov_b64 s[12:13], 16 8336; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8337; GFX7-NEXT: s_mov_b32 s6, s4 8338; GFX7-NEXT: s_mov_b32 s7, s5 8339; GFX7-NEXT: s_mov_b32 s11, s12 8340; GFX7-NEXT: s_mov_b32 s10, s13 8341; GFX7-NEXT: s_add_u32 s6, s6, s11 8342; GFX7-NEXT: s_addc_u32 s10, s7, s10 8343; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8344; GFX7-NEXT: s_mov_b32 s7, s10 8345; GFX7-NEXT: v_mov_b32_e32 v2, s9 8346; GFX7-NEXT: v_mov_b32_e32 v0, s8 8347; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8348; GFX7-NEXT: v_mov_b32_e32 v3, v0 8349; GFX7-NEXT: v_mov_b32_e32 v0, s6 8350; GFX7-NEXT: v_mov_b32_e32 v1, s7 8351; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8352; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8353; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8354; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 8355; GFX7-NEXT: v_mov_b32_e32 v1, s5 8356; GFX7-NEXT: s_waitcnt vmcnt(0) 8357; GFX7-NEXT: flat_store_dword v[0:1], v2 8358; GFX7-NEXT: s_endpgm 8359; 8360; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8361; GFX10-WGP: ; %bb.0: ; %entry 8362; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8363; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8364; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8365; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8366; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8367; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8368; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8369; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8370; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8371; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8372; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8373; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8374; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8375; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8376; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 8377; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8378; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8379; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8380; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8381; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8382; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8383; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8384; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8385; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8386; GFX10-WGP-NEXT: buffer_gl0_inv 8387; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8388; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8389; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8390; GFX10-WGP-NEXT: s_endpgm 8391; 8392; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8393; GFX10-CU: ; %bb.0: ; %entry 8394; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8395; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8396; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 8397; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8398; GFX10-CU-NEXT: 
s_mov_b64 s[12:13], 16 8399; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8400; GFX10-CU-NEXT: s_mov_b32 s6, s4 8401; GFX10-CU-NEXT: s_mov_b32 s7, s5 8402; GFX10-CU-NEXT: s_mov_b32 s11, s12 8403; GFX10-CU-NEXT: s_mov_b32 s10, s13 8404; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 8405; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 8406; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8407; GFX10-CU-NEXT: s_mov_b32 s7, s10 8408; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 8409; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 8410; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8411; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8412; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8413; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8414; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8415; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8416; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8417; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8418; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8419; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8420; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8421; GFX10-CU-NEXT: s_endpgm 8422; 8423; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8424; SKIP-CACHE-INV: ; %bb.0: ; %entry 8425; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8426; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8427; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8428; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8429; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8430; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8431; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8432; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8433; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8434; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8435; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8436; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8437; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8438; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8439; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8440; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 
8441; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8443; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8444; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8445; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8446; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8447; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8448; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8449; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8450; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8451; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8452; SKIP-CACHE-INV-NEXT: s_endpgm 8453; 8454; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8455; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8456; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8457; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8458; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8459; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8460; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8461; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8462; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8463; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8464; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8465; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8466; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8467; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8468; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8469; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8470; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8471; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8472; 8473; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8474; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8475; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8476; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8477; GFX90A-TGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc 8478; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8479; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8480; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8481; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8482; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8483; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8484; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8485; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8486; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8487; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8488; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8489; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8490; GFX90A-TGSPLIT-NEXT: s_endpgm 8491; 8492; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8493; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8494; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8495; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8496; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8497; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8498; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8499; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8500; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8501; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8502; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8503; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8504; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8505; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8506; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8507; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8508; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8509; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8510; 8511; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8512; GFX940-TGSPLIT: ; %bb.0: ; %entry 8513; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 8514; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8515; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8516; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8517; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8518; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8519; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8520; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8521; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8522; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8523; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8524; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8525; GFX940-TGSPLIT-NEXT: buffer_inv sc0 8526; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8527; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8528; GFX940-TGSPLIT-NEXT: s_endpgm 8529; 8530; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8531; GFX11-WGP: ; %bb.0: ; %entry 8532; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8533; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8534; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8535; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8536; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8537; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8538; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8539; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8540; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8541; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8542; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8543; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8544; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8545; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8546; GFX11-WGP-NEXT: buffer_gl0_inv 8547; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8548; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8549; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8550; GFX11-WGP-NEXT: s_endpgm 8551; 8552; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8553; GFX11-CU: ; 
%bb.0: ; %entry 8554; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8555; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8556; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8557; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8558; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8559; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8560; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8561; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8562; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8563; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8564; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8565; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8566; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8567; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8568; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8569; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 8570; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8571; GFX11-CU-NEXT: s_endpgm 8572; 8573; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8574; GFX12-WGP: ; %bb.0: ; %entry 8575; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8576; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8577; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8578; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8579; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8580; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8581; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8582; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8583; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8584; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8585; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 8586; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 8587; GFX12-WGP-NEXT: s_wait_storecnt 0x0 8588; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8589; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 8590; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 8591; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 8592; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8593; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 8594; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8595; GFX12-WGP-NEXT: 
v_mov_b32_e32 v1, s1 8596; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8597; GFX12-WGP-NEXT: s_endpgm 8598; 8599; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: 8600; GFX12-CU: ; %bb.0: ; %entry 8601; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8602; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8603; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8604; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8605; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8606; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8607; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8608; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8609; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8610; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8611; GFX12-CU-NEXT: s_wait_dscnt 0x0 8612; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8613; GFX12-CU-NEXT: s_wait_dscnt 0x0 8614; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8615; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8616; GFX12-CU-NEXT: s_wait_loadcnt 0x0 8617; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 8618; GFX12-CU-NEXT: s_endpgm 8619 ptr %out, i32 %in, i32 %old) { 8620entry: 8621 %gep = getelementptr i32, ptr %out, i32 4 8622 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst 8623 %val0 = extractvalue { i32, i1 } %val, 0 8624 store i32 %val0, ptr %out, align 4 8625 ret void 8626} 8627 8628define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( 8629; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8630; GFX7: ; %bb.0: ; %entry 8631; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 8632; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8633; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8634; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8635; GFX7-NEXT: s_mov_b64 s[12:13], 16 8636; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8637; GFX7-NEXT: s_mov_b32 s6, s4 8638; GFX7-NEXT: s_mov_b32 s7, s5 8639; GFX7-NEXT: s_mov_b32 s11, s12 8640; GFX7-NEXT: s_mov_b32 s10, s13 8641; GFX7-NEXT: s_add_u32 s6, s6, s11 8642; GFX7-NEXT: s_addc_u32 
s10, s7, s10 8643; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8644; GFX7-NEXT: s_mov_b32 s7, s10 8645; GFX7-NEXT: v_mov_b32_e32 v2, s9 8646; GFX7-NEXT: v_mov_b32_e32 v0, s8 8647; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8648; GFX7-NEXT: v_mov_b32_e32 v3, v0 8649; GFX7-NEXT: v_mov_b32_e32 v0, s6 8650; GFX7-NEXT: v_mov_b32_e32 v1, s7 8651; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8652; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8653; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8654; GFX7-NEXT: v_mov_b32_e32 v0, s4 8655; GFX7-NEXT: v_mov_b32_e32 v1, s5 8656; GFX7-NEXT: s_waitcnt vmcnt(0) 8657; GFX7-NEXT: flat_store_dword v[0:1], v2 8658; GFX7-NEXT: s_endpgm 8659; 8660; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8661; GFX10-WGP: ; %bb.0: ; %entry 8662; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8663; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8664; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8665; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8666; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8667; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8668; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8669; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8670; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8671; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8672; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8673; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8674; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8675; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8676; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 8677; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8678; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8679; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8680; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8681; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8682; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8683; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8684; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8685; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8686; 
GFX10-WGP-NEXT: buffer_gl0_inv 8687; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8688; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8689; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8690; GFX10-WGP-NEXT: s_endpgm 8691; 8692; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8693; GFX10-CU: ; %bb.0: ; %entry 8694; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8695; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8696; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 8697; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8698; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 8699; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8700; GFX10-CU-NEXT: s_mov_b32 s6, s4 8701; GFX10-CU-NEXT: s_mov_b32 s7, s5 8702; GFX10-CU-NEXT: s_mov_b32 s11, s12 8703; GFX10-CU-NEXT: s_mov_b32 s10, s13 8704; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 8705; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 8706; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8707; GFX10-CU-NEXT: s_mov_b32 s7, s10 8708; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 8709; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 8710; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8711; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8712; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8713; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8714; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8715; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8716; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8717; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8718; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8719; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 8720; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8721; GFX10-CU-NEXT: s_endpgm 8722; 8723; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8724; SKIP-CACHE-INV: ; %bb.0: ; %entry 8725; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8726; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8727; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8728; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8729; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8730; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8731; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8732; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8733; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8734; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8735; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8736; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8737; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8738; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8739; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8740; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8741; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8742; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8743; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8744; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8745; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8746; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8747; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8748; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8749; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8750; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 8751; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8752; SKIP-CACHE-INV-NEXT: s_endpgm 8753; 8754; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8755; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8756; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8757; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8758; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8759; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8760; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8761; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8762; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8763; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8764; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8765; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8766; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8767; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8768; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8769; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8770; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8771; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8772; 8773; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8774; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8775; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8776; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8777; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8778; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8779; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8780; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8781; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8782; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8783; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8784; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8785; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8786; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8787; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 8788; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8789; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8790; GFX90A-TGSPLIT-NEXT: s_endpgm 8791; 8792; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8793; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8794; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8795; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8796; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8797; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8798; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8799; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8800; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8801; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8802; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8803; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8804; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8805; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8806; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8807; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 8808; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8809; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8810; 8811; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8812; GFX940-TGSPLIT: ; %bb.0: ; %entry 8813; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8814; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8815; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8816; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8817; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8818; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8819; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8820; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8821; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8822; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8823; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8824; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8825; GFX940-TGSPLIT-NEXT: buffer_inv sc0 8826; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8827; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8828; GFX940-TGSPLIT-NEXT: s_endpgm 8829; 8830; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8831; GFX11-WGP: ; %bb.0: ; %entry 8832; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8833; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8834; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8835; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8836; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8837; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8838; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8839; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8840; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8841; GFX11-WGP-NEXT: v_mov_b32_e32 
v1, s1 8842; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8843; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8844; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8845; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8846; GFX11-WGP-NEXT: buffer_gl0_inv 8847; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8848; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8849; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8850; GFX11-WGP-NEXT: s_endpgm 8851; 8852; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8853; GFX11-CU: ; %bb.0: ; %entry 8854; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8855; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8856; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8857; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8858; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8859; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8860; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8861; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8862; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8863; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8864; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8865; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8866; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8867; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8868; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8869; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 8870; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8871; GFX11-CU-NEXT: s_endpgm 8872; 8873; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8874; GFX12-WGP: ; %bb.0: ; %entry 8875; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8876; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8877; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8878; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8879; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8880; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8881; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8882; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8883; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8884; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8885; 
GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 8886; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 8887; GFX12-WGP-NEXT: s_wait_storecnt 0x0 8888; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8889; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 8890; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8891; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 8892; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8893; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8894; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8895; GFX12-WGP-NEXT: s_endpgm 8896; 8897; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: 8898; GFX12-CU: ; %bb.0: ; %entry 8899; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8900; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8901; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8902; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8903; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8904; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8905; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8906; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8907; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8908; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8909; GFX12-CU-NEXT: s_wait_dscnt 0x0 8910; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8911; GFX12-CU-NEXT: s_wait_dscnt 0x0 8912; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8913; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8914; GFX12-CU-NEXT: s_wait_loadcnt 0x0 8915; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 8916; GFX12-CU-NEXT: s_endpgm 8917 ptr %out, i32 %in, i32 %old) { 8918entry: 8919 %gep = getelementptr i32, ptr %out, i32 4 8920 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst 8921 %val0 = extractvalue { i32, i1 } %val, 0 8922 store i32 %val0, ptr %out, align 4 8923 ret void 8924} 8925 8926define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( 8927; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 8928; GFX7: ; %bb.0: ; %entry 8929; GFX7-NEXT: s_mov_b64 s[6:7], 
s[8:9] 8930; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8931; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8932; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8933; GFX7-NEXT: s_mov_b64 s[12:13], 16 8934; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8935; GFX7-NEXT: s_mov_b32 s6, s4 8936; GFX7-NEXT: s_mov_b32 s7, s5 8937; GFX7-NEXT: s_mov_b32 s11, s12 8938; GFX7-NEXT: s_mov_b32 s10, s13 8939; GFX7-NEXT: s_add_u32 s6, s6, s11 8940; GFX7-NEXT: s_addc_u32 s10, s7, s10 8941; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8942; GFX7-NEXT: s_mov_b32 s7, s10 8943; GFX7-NEXT: v_mov_b32_e32 v2, s9 8944; GFX7-NEXT: v_mov_b32_e32 v0, s8 8945; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8946; GFX7-NEXT: v_mov_b32_e32 v3, v0 8947; GFX7-NEXT: v_mov_b32_e32 v0, s6 8948; GFX7-NEXT: v_mov_b32_e32 v1, s7 8949; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8950; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8951; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8952; GFX7-NEXT: v_mov_b32_e32 v0, s4 8953; GFX7-NEXT: v_mov_b32_e32 v1, s5 8954; GFX7-NEXT: s_waitcnt vmcnt(0) 8955; GFX7-NEXT: flat_store_dword v[0:1], v2 8956; GFX7-NEXT: s_endpgm 8957; 8958; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 8959; GFX10-WGP: ; %bb.0: ; %entry 8960; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8961; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8962; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8963; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8964; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8965; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8966; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8967; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8968; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8969; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8970; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8971; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8972; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8973; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8974; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 8975; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8976; 
GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8977; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8978; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8979; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8980; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8981; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 8982; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8983; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8984; GFX10-WGP-NEXT: buffer_gl0_inv 8985; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8986; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8987; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8988; GFX10-WGP-NEXT: s_endpgm 8989; 8990; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 8991; GFX10-CU: ; %bb.0: ; %entry 8992; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8993; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8994; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 8995; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8996; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 8997; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8998; GFX10-CU-NEXT: s_mov_b32 s6, s4 8999; GFX10-CU-NEXT: s_mov_b32 s7, s5 9000; GFX10-CU-NEXT: s_mov_b32 s11, s12 9001; GFX10-CU-NEXT: s_mov_b32 s10, s13 9002; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 9003; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 9004; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9005; GFX10-CU-NEXT: s_mov_b32 s7, s10 9006; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 9007; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 9008; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9009; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 9010; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 9011; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 9012; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9013; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9014; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9015; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9016; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9017; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9018; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9019; 
GFX10-CU-NEXT: s_endpgm 9020; 9021; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9022; SKIP-CACHE-INV: ; %bb.0: ; %entry 9023; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 9024; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 9025; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 9026; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 9027; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 9028; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9029; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 9030; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 9031; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 9032; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 9033; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 9034; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 9035; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 9036; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 9037; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 9038; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9039; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9040; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 9041; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9042; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9043; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9044; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9045; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9046; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 9047; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 9048; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9049; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9050; SKIP-CACHE-INV-NEXT: s_endpgm 9051; 9052; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9053; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9054; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9055; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9056; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9057; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9058; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 
9059; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9060; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9061; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9062; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9063; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9064; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9065; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9066; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9067; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9068; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9069; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9070; 9071; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9072; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9073; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9074; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9075; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9076; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9077; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9078; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9079; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9080; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9081; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9082; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9083; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9084; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9085; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 9086; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9087; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9088; GFX90A-TGSPLIT-NEXT: s_endpgm 9089; 9090; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9091; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9092; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9093; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9094; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9095; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9096; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9097; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9098; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9099; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9100; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9101; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9102; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9103; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9104; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9105; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9106; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9107; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9108; 9109; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9110; GFX940-TGSPLIT: ; %bb.0: ; %entry 9111; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9112; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9113; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9114; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9115; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9116; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9117; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9118; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9119; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9120; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9121; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9122; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9123; GFX940-TGSPLIT-NEXT: buffer_inv sc0 9124; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9125; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9126; GFX940-TGSPLIT-NEXT: s_endpgm 9127; 9128; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9129; GFX11-WGP: ; %bb.0: ; %entry 9130; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9131; GFX11-WGP-NEXT: 
s_load_b32 s3, s[4:5], 0x8 9132; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9133; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9134; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 9135; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 9136; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9137; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 9138; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9139; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9140; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9141; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9142; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9143; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9144; GFX11-WGP-NEXT: buffer_gl0_inv 9145; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9146; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9147; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 9148; GFX11-WGP-NEXT: s_endpgm 9149; 9150; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9151; GFX11-CU: ; %bb.0: ; %entry 9152; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9153; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9154; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9155; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9156; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 9157; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 9158; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9159; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 9160; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9161; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9162; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9163; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9164; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9165; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9166; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9167; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 9168; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 9169; GFX11-CU-NEXT: s_endpgm 9170; 9171; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9172; GFX12-WGP: ; %bb.0: ; %entry 9173; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9174; GFX12-WGP-NEXT: s_load_b32 
s3, s[4:5], 0x8 9175; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9176; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 9177; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 9178; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 9179; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9180; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 9181; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9182; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9183; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 9184; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 9185; GFX12-WGP-NEXT: s_wait_storecnt 0x0 9186; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9187; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 9188; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 9189; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 9190; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9191; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 9192; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9193; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9194; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 9195; GFX12-WGP-NEXT: s_endpgm 9196; 9197; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: 9198; GFX12-CU: ; %bb.0: ; %entry 9199; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9200; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9201; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9202; GFX12-CU-NEXT: s_wait_kmcnt 0x0 9203; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 9204; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 9205; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9206; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 9207; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9208; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9209; GFX12-CU-NEXT: s_wait_dscnt 0x0 9210; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9211; GFX12-CU-NEXT: s_wait_dscnt 0x0 9212; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9213; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9214; GFX12-CU-NEXT: s_wait_loadcnt 0x0 9215; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 9216; GFX12-CU-NEXT: s_endpgm 9217 ptr 
%out, i32 %in, i32 %old) { 9218entry: 9219 %gep = getelementptr i32, ptr %out, i32 4 9220 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst 9221 %val0 = extractvalue { i32, i1 } %val, 0 9222 store i32 %val0, ptr %out, align 4 9223 ret void 9224} 9225 9226define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( 9227; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9228; GFX7: ; %bb.0: ; %entry 9229; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 9230; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9231; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 9232; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 9233; GFX7-NEXT: s_mov_b64 s[12:13], 16 9234; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9235; GFX7-NEXT: s_mov_b32 s6, s4 9236; GFX7-NEXT: s_mov_b32 s7, s5 9237; GFX7-NEXT: s_mov_b32 s11, s12 9238; GFX7-NEXT: s_mov_b32 s10, s13 9239; GFX7-NEXT: s_add_u32 s6, s6, s11 9240; GFX7-NEXT: s_addc_u32 s10, s7, s10 9241; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9242; GFX7-NEXT: s_mov_b32 s7, s10 9243; GFX7-NEXT: v_mov_b32_e32 v2, s9 9244; GFX7-NEXT: v_mov_b32_e32 v0, s8 9245; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9246; GFX7-NEXT: v_mov_b32_e32 v3, v0 9247; GFX7-NEXT: v_mov_b32_e32 v0, s6 9248; GFX7-NEXT: v_mov_b32_e32 v1, s7 9249; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9250; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9251; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9252; GFX7-NEXT: v_mov_b32_e32 v0, s4 9253; GFX7-NEXT: v_mov_b32_e32 v1, s5 9254; GFX7-NEXT: s_waitcnt vmcnt(0) 9255; GFX7-NEXT: flat_store_dword v[0:1], v2 9256; GFX7-NEXT: s_endpgm 9257; 9258; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9259; GFX10-WGP: ; %bb.0: ; %entry 9260; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 9261; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9262; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 9263; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 9264; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 9265; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9266; GFX10-WGP-NEXT: s_mov_b32 s6, s4 9267; GFX10-WGP-NEXT: s_mov_b32 s7, s5 9268; GFX10-WGP-NEXT: s_mov_b32 s11, s12 9269; GFX10-WGP-NEXT: s_mov_b32 s10, s13 9270; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 9271; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 9272; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9273; GFX10-WGP-NEXT: s_mov_b32 s7, s10 9274; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 9275; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 9276; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9277; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 9278; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 9279; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 9280; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9281; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9282; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9283; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9284; GFX10-WGP-NEXT: buffer_gl0_inv 9285; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9286; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9287; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9288; GFX10-WGP-NEXT: s_endpgm 9289; 9290; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9291; GFX10-CU: ; %bb.0: ; %entry 9292; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 9293; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9294; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 9295; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 9296; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 9297; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9298; GFX10-CU-NEXT: s_mov_b32 s6, s4 9299; GFX10-CU-NEXT: s_mov_b32 s7, s5 9300; GFX10-CU-NEXT: s_mov_b32 s11, s12 9301; GFX10-CU-NEXT: s_mov_b32 s10, s13 9302; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 9303; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 9304; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9305; GFX10-CU-NEXT: s_mov_b32 s7, s10 9306; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 9307; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 9308; GFX10-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 9309; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 9310; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 9311; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 9312; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9313; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9314; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9315; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9316; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9317; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 9318; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9319; GFX10-CU-NEXT: s_endpgm 9320; 9321; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9322; SKIP-CACHE-INV: ; %bb.0: ; %entry 9323; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 9324; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 9325; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 9326; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 9327; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 9328; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9329; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 9330; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 9331; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 9332; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 9333; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 9334; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 9335; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 9336; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 9337; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 9338; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9339; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9340; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 9341; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9342; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9343; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9344; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9345; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9346; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 9347; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 9348; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 9349; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9350; SKIP-CACHE-INV-NEXT: s_endpgm 9351; 9352; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9353; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9354; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9355; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9356; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9357; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9358; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9359; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9360; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9361; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9362; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9363; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9364; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9365; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9366; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9367; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9368; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9369; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9370; 9371; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9372; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9373; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9374; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9375; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9376; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9377; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9378; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9379; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9380; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9381; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9382; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9383; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9384; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) 9385; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 9386; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9387; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9388; GFX90A-TGSPLIT-NEXT: s_endpgm 9389; 9390; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9391; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9392; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9393; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9394; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9395; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9396; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9397; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9398; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9399; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9400; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9401; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9402; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9403; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9404; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9405; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 9406; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9407; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9408; 9409; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9410; GFX940-TGSPLIT: ; %bb.0: ; %entry 9411; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9412; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9413; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9414; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9415; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9416; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9417; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9418; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9419; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9420; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9421; 
GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9422; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9423; GFX940-TGSPLIT-NEXT: buffer_inv sc0 9424; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9425; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9426; GFX940-TGSPLIT-NEXT: s_endpgm 9427; 9428; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9429; GFX11-WGP: ; %bb.0: ; %entry 9430; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9431; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9432; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9433; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9434; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 9435; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 9436; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9437; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 9438; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9439; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9440; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9441; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 9442; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9443; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9444; GFX11-WGP-NEXT: buffer_gl0_inv 9445; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9446; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9447; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 9448; GFX11-WGP-NEXT: s_endpgm 9449; 9450; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9451; GFX11-CU: ; %bb.0: ; %entry 9452; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9453; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9454; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9455; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9456; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 9457; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 9458; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9459; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 9460; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9461; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9462; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9463; 
GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9464; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9465; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9466; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9467; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 9468; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 9469; GFX11-CU-NEXT: s_endpgm 9470; 9471; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9472; GFX12-WGP: ; %bb.0: ; %entry 9473; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9474; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9475; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9476; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 9477; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 9478; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 9479; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9480; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 9481; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9482; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9483; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 9484; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 9485; GFX12-WGP-NEXT: s_wait_storecnt 0x0 9486; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9487; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 9488; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 9489; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 9490; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9491; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 9492; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9493; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9494; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 9495; GFX12-WGP-NEXT: s_endpgm 9496; 9497; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: 9498; GFX12-CU: ; %bb.0: ; %entry 9499; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9500; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9501; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9502; GFX12-CU-NEXT: s_wait_kmcnt 0x0 9503; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 9504; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 9505; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Volatile cmpxchg on the flat pointer %out + 4 x i32 (byte offset 16) at
; syncscope("workgroup"), with seq_cst success ordering and seq_cst failure
; ordering. The old value returned by the cmpxchg (element 0 of the result
; pair) is stored back to %out, so this exercises the value-returning form.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Atomic i32 load from the flat pointer %in at syncscope("workgroup-one-as")
; with unordered ordering (align 4); the loaded value is then written to %out
; with an ordinary, non-atomic store.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %in, ptr %out) {
entry:
  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4
  store i32 %val, ptr %out
  ret void
}

define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:
v_mov_b32_e32 v1, s5 10018; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10019; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10020; GFX10-WGP-NEXT: s_endpgm 10021; 10022; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: 10023; GFX10-CU: ; %bb.0: ; %entry 10024; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10025; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10026; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10027; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10028; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10029; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 10030; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10031; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10032; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10033; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10034; GFX10-CU-NEXT: s_endpgm 10035; 10036; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load: 10037; SKIP-CACHE-INV: ; %bb.0: ; %entry 10038; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10039; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 10040; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10041; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10042; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10043; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 10044; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 10045; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 10046; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10047; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10048; SKIP-CACHE-INV-NEXT: s_endpgm 10049; 10050; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: 10051; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10052; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10053; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10054; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10055; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10056; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10057; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10058; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10059; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10060; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10061; 10062; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: 10063; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10064; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10065; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10066; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10067; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10068; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc 10069; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10070; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10071; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10072; GFX90A-TGSPLIT-NEXT: s_endpgm 10073; 10074; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: 10075; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10076; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10077; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10078; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10079; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10080; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 10081; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10082; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10083; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10084; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10085; 10086; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: 10087; GFX940-TGSPLIT: ; %bb.0: ; %entry 10088; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10089; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10090; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10091; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10092; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 10093; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10094; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10095; GFX940-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 sc0 sc1 10096; GFX940-TGSPLIT-NEXT: s_endpgm 10097; 10098; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load: 10099; GFX11-WGP: ; %bb.0: ; %entry 10100; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10101; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10102; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10103; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10104; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10105; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc 10106; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 10107; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 10108; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10109; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10110; GFX11-WGP-NEXT: s_endpgm 10111; 10112; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load: 10113; GFX11-CU: ; %bb.0: ; %entry 10114; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10115; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10116; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10117; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10118; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10119; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 10120; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 10121; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 10122; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10123; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10124; GFX11-CU-NEXT: s_endpgm 10125; 10126; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load: 10127; GFX12-WGP: ; %bb.0: ; %entry 10128; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10129; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10130; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10131; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10132; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10133; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE 10134; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 10135; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 10136; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 10137; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10138; GFX12-WGP-NEXT: s_endpgm 10139; 10140; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load: 10141; GFX12-CU: 
; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %in, ptr %out) {
entry:
  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4
  store i32 %val, ptr %out
  ret void
}

; Kernel under test: atomic i32 load from %in with syncscope("workgroup-one-as")
; and acquire ordering; the loaded value is written to %out by an ordinary
; (non-atomic) store. The prefixed check lines embedded in the signature are
; autogenerated (see the NOTE at the top of the file): regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %in, ptr %out) {
entry:
  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4
  store i32 %val, ptr %out
  ret void
}

; Kernel under test: atomic i32 load from %in with syncscope("workgroup-one-as")
; and seq_cst ordering; the loaded value is written to %out by an ordinary
; (non-atomic) store. Check lines are autogenerated; regenerate, do not hand-edit.
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %in, ptr %out) {
entry:
  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
  store i32 %val, ptr %out
  ret void
}

; Kernel under test: atomic i32 store of the kernel argument %in to %out with
; syncscope("workgroup-one-as") and unordered ordering. Check lines are
; autogenerated; regenerate, do not hand-edit.
define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    i32 %in, ptr %out) {
entry:
  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
  ret void
}

; Kernel under test: atomic i32 store of the kernel argument %in to %out with
; syncscope("workgroup-one-as") and monotonic ordering. Check lines are
; autogenerated; regenerate, do not hand-edit.
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    i32 %in, ptr %out) {
entry:
  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
  ret void
}

; Kernel under test: atomic i32 store of the kernel argument %in to %out with
; syncscope("workgroup-one-as") and release ordering. Check lines are
; autogenerated; regenerate, do not hand-edit.
define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX7-LABEL: flat_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    i32 %in, ptr %out) {
entry:
  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
  ret void
}

define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0:
; %entry 10937; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 10938; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 10939; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10940; GFX7-NEXT: v_mov_b32_e32 v0, s6 10941; GFX7-NEXT: v_mov_b32_e32 v1, s7 10942; GFX7-NEXT: v_mov_b32_e32 v2, s4 10943; GFX7-NEXT: flat_store_dword v[0:1], v2 10944; GFX7-NEXT: s_endpgm 10945; 10946; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: 10947; GFX10-WGP: ; %bb.0: ; %entry 10948; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 10949; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10950; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10951; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10952; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10953; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 10954; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 10955; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 10956; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10957; GFX10-WGP-NEXT: s_endpgm 10958; 10959; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: 10960; GFX10-CU: ; %bb.0: ; %entry 10961; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 10962; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10963; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10964; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10965; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10966; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 10967; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10968; GFX10-CU-NEXT: s_endpgm 10969; 10970; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store: 10971; SKIP-CACHE-INV: ; %bb.0: ; %entry 10972; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 10973; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10974; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10975; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10976; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10977; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10978; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10979; SKIP-CACHE-INV-NEXT: s_endpgm 10980; 10981; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: 10982; GFX90A-NOTTGSPLIT: ; %bb.0: 
; %entry 10983; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10984; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10985; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10986; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10987; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10988; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10989; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10990; 10991; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: 10992; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10993; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10994; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10995; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10996; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10997; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10998; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10999; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11000; GFX90A-TGSPLIT-NEXT: s_endpgm 11001; 11002; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: 11003; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11004; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 11005; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11006; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11007; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11008; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11009; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11010; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11011; 11012; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: 11013; GFX940-TGSPLIT: ; %bb.0: ; %entry 11014; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 11015; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 11016; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11017; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11018; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11019; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11020; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11021; 
GFX940-TGSPLIT-NEXT: s_endpgm 11022; 11023; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: 11024; GFX11-WGP: ; %bb.0: ; %entry 11025; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 11026; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 11027; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11028; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11029; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11030; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11031; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 11032; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11033; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11034; GFX11-WGP-NEXT: s_endpgm 11035; 11036; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store: 11037; GFX11-CU: ; %bb.0: ; %entry 11038; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 11039; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 11040; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11041; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11042; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11043; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11044; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11045; GFX11-CU-NEXT: s_endpgm 11046; 11047; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: 11048; GFX12-WGP: ; %bb.0: ; %entry 11049; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 11050; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 11051; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11052; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11053; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11054; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11055; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 11056; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 11057; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 11058; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11059; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE 11060; GFX12-WGP-NEXT: s_endpgm 11061; 11062; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store: 11063; GFX12-CU: ; %bb.0: ; %entry 11064; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 11065; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 11066; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11067; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11068; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s3 11069; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11070; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 11071; GFX12-CU-NEXT: s_endpgm 11072 i32 %in, ptr %out) { 11073entry: 11074 store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 11075 ret void 11076} 11077 11078define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( 11079; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11080; GFX7: ; %bb.0: ; %entry 11081; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11082; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11083; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11084; GFX7-NEXT: v_mov_b32_e32 v0, s6 11085; GFX7-NEXT: v_mov_b32_e32 v1, s7 11086; GFX7-NEXT: v_mov_b32_e32 v2, s4 11087; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11088; GFX7-NEXT: s_endpgm 11089; 11090; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11091; GFX10-WGP: ; %bb.0: ; %entry 11092; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11093; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11094; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11095; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11096; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11097; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11098; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11099; GFX10-WGP-NEXT: s_endpgm 11100; 11101; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11102; GFX10-CU: ; %bb.0: ; %entry 11103; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11104; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11105; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11106; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11107; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11108; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11109; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11110; GFX10-CU-NEXT: s_endpgm 11111; 11112; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11113; SKIP-CACHE-INV: ; %bb.0: ; %entry 11114; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11115; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11116; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11117; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11118; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11119; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11120; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11121; SKIP-CACHE-INV-NEXT: s_endpgm 11122; 11123; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11124; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11125; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11126; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11127; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11128; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11129; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11130; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11131; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11132; 11133; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11134; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11135; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11136; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11137; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11138; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11139; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11140; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11141; GFX90A-TGSPLIT-NEXT: s_endpgm 11142; 11143; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11144; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11145; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11146; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11147; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11148; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11149; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11150; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11151; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11152; 11153; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11154; GFX940-TGSPLIT: ; %bb.0: ; %entry 11155; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 11156; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11157; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11158; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11159; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11160; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11161; GFX940-TGSPLIT-NEXT: s_endpgm 11162; 11163; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11164; GFX11-WGP: ; %bb.0: ; %entry 11165; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11166; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11167; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11168; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11169; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11170; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11171; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11172; GFX11-WGP-NEXT: s_endpgm 11173; 11174; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11175; GFX11-CU: ; %bb.0: ; %entry 11176; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11177; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11178; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11179; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11180; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11181; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11182; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11183; GFX11-CU-NEXT: s_endpgm 11184; 11185; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11186; GFX12-WGP: ; %bb.0: ; %entry 11187; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11188; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11189; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11190; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11191; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11192; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11193; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 11194; GFX12-WGP-NEXT: s_endpgm 11195; 11196; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: 11197; GFX12-CU: ; %bb.0: ; %entry 11198; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11199; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11200; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 11201; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11202; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11203; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11204; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11205; GFX12-CU-NEXT: s_endpgm 11206 ptr %out, i32 %in) { 11207entry: 11208 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic 11209 ret void 11210} 11211 11212define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( 11213; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11214; GFX7: ; %bb.0: ; %entry 11215; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11216; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11217; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11218; GFX7-NEXT: v_mov_b32_e32 v0, s6 11219; GFX7-NEXT: v_mov_b32_e32 v1, s7 11220; GFX7-NEXT: v_mov_b32_e32 v2, s4 11221; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11222; GFX7-NEXT: s_endpgm 11223; 11224; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11225; GFX10-WGP: ; %bb.0: ; %entry 11226; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11227; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11228; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11229; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11230; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11231; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11232; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11233; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11234; GFX10-WGP-NEXT: buffer_gl0_inv 11235; GFX10-WGP-NEXT: s_endpgm 11236; 11237; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11238; GFX10-CU: ; %bb.0: ; %entry 11239; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11240; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11241; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11242; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11243; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11244; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11245; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11246; GFX10-CU-NEXT: s_endpgm 11247; 11248; SKIP-CACHE-INV-LABEL: 
flat_workgroup_one_as_acquire_atomicrmw: 11249; SKIP-CACHE-INV: ; %bb.0: ; %entry 11250; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11251; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11252; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11253; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11254; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11255; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11256; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11257; SKIP-CACHE-INV-NEXT: s_endpgm 11258; 11259; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11260; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11261; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11262; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11263; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11264; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11265; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11266; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11267; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11268; 11269; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11270; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11271; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11272; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11273; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11274; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11275; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11276; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11277; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11278; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 11279; GFX90A-TGSPLIT-NEXT: s_endpgm 11280; 11281; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11282; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11283; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11284; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11285; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11286; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11287; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11288; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11289; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11290; 11291; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11292; GFX940-TGSPLIT: ; %bb.0: ; %entry 11293; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11294; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11295; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11296; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11297; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11298; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11299; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11300; GFX940-TGSPLIT-NEXT: buffer_inv sc0 11301; GFX940-TGSPLIT-NEXT: s_endpgm 11302; 11303; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11304; GFX11-WGP: ; %bb.0: ; %entry 11305; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11306; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11307; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11308; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11309; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11310; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11311; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11312; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11313; GFX11-WGP-NEXT: buffer_gl0_inv 11314; GFX11-WGP-NEXT: s_endpgm 11315; 11316; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11317; GFX11-CU: ; %bb.0: ; %entry 11318; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11319; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11320; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11321; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11322; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11323; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11324; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11325; GFX11-CU-NEXT: s_endpgm 11326; 11327; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11328; GFX12-WGP: ; %bb.0: ; %entry 11329; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11330; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11331; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 
11332; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11333; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11334; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11335; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 11336; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11337; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 11338; GFX12-WGP-NEXT: s_endpgm 11339; 11340; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: 11341; GFX12-CU: ; %bb.0: ; %entry 11342; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11343; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11344; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11345; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11346; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11347; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11348; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11349; GFX12-CU-NEXT: s_endpgm 11350 ptr %out, i32 %in) { 11351entry: 11352 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire 11353 ret void 11354} 11355 11356define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( 11357; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: 11358; GFX7: ; %bb.0: ; %entry 11359; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11360; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11361; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11362; GFX7-NEXT: v_mov_b32_e32 v0, s6 11363; GFX7-NEXT: v_mov_b32_e32 v1, s7 11364; GFX7-NEXT: v_mov_b32_e32 v2, s4 11365; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11366; GFX7-NEXT: s_endpgm 11367; 11368; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: 11369; GFX10-WGP: ; %bb.0: ; %entry 11370; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11371; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11372; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11373; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11374; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11375; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11376; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 11377; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11378; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11379; GFX10-WGP-NEXT: 
s_endpgm 11380; 11381; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: 11382; GFX10-CU: ; %bb.0: ; %entry 11383; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11384; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11385; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11386; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11387; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11388; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11389; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11390; GFX10-CU-NEXT: s_endpgm 11391; 11392; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw: 11393; SKIP-CACHE-INV: ; %bb.0: ; %entry 11394; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11395; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11396; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11397; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11398; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11399; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11400; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11401; SKIP-CACHE-INV-NEXT: s_endpgm 11402; 11403; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: 11404; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11405; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11406; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11407; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11408; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11409; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11410; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11411; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11412; 11413; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: 11414; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11415; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11416; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11417; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11418; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11419; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11420; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
11421; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11422; GFX90A-TGSPLIT-NEXT: s_endpgm 11423; 11424; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: 11425; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11426; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11427; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11428; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11429; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11430; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11431; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11432; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11433; 11434; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: 11435; GFX940-TGSPLIT: ; %bb.0: ; %entry 11436; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11437; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11438; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11439; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11440; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11441; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11442; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11443; GFX940-TGSPLIT-NEXT: s_endpgm 11444; 11445; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: 11446; GFX11-WGP: ; %bb.0: ; %entry 11447; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11448; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11449; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11450; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11451; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11452; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11453; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 11454; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11455; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11456; GFX11-WGP-NEXT: s_endpgm 11457; 11458; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: 11459; GFX11-CU: ; %bb.0: ; %entry 11460; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11461; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11462; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11463; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 
11464; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11465; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11466; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11467; GFX11-CU-NEXT: s_endpgm 11468; 11469; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: 11470; GFX12-WGP: ; %bb.0: ; %entry 11471; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11472; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11473; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11474; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11475; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11476; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11477; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 11478; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 11479; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 11480; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11481; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 11482; GFX12-WGP-NEXT: s_endpgm 11483; 11484; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: 11485; GFX12-CU: ; %bb.0: ; %entry 11486; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11487; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11488; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11489; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11490; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11491; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11492; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11493; GFX12-CU-NEXT: s_endpgm 11494 ptr %out, i32 %in) { 11495entry: 11496 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release 11497 ret void 11498} 11499 11500define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( 11501; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11502; GFX7: ; %bb.0: ; %entry 11503; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11504; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11505; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11506; GFX7-NEXT: v_mov_b32_e32 v0, s6 11507; GFX7-NEXT: v_mov_b32_e32 v1, s7 11508; GFX7-NEXT: v_mov_b32_e32 v2, s4 11509; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11510; GFX7-NEXT: s_endpgm 11511; 11512; GFX10-WGP-LABEL: 
flat_workgroup_one_as_acq_rel_atomicrmw: 11513; GFX10-WGP: ; %bb.0: ; %entry 11514; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11515; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11516; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11517; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11518; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11519; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11520; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 11521; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11522; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11523; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11524; GFX10-WGP-NEXT: buffer_gl0_inv 11525; GFX10-WGP-NEXT: s_endpgm 11526; 11527; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11528; GFX10-CU: ; %bb.0: ; %entry 11529; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11530; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11531; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11532; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11533; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11534; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11535; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11536; GFX10-CU-NEXT: s_endpgm 11537; 11538; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11539; SKIP-CACHE-INV: ; %bb.0: ; %entry 11540; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11541; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11542; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11543; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11544; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11545; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11546; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11547; SKIP-CACHE-INV-NEXT: s_endpgm 11548; 11549; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11550; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11551; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11552; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11553; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11554; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 
11555; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11556; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11557; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11558; 11559; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11560; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11561; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11562; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11563; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11564; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11565; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11566; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11567; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11568; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11569; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 11570; GFX90A-TGSPLIT-NEXT: s_endpgm 11571; 11572; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11573; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11574; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11575; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11576; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11577; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11578; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11579; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11580; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11581; 11582; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11583; GFX940-TGSPLIT: ; %bb.0: ; %entry 11584; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11585; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11586; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11587; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11588; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11589; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11590; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11591; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11592; GFX940-TGSPLIT-NEXT: buffer_inv sc0 11593; GFX940-TGSPLIT-NEXT: s_endpgm 11594; 11595; GFX11-WGP-LABEL: 
flat_workgroup_one_as_acq_rel_atomicrmw: 11596; GFX11-WGP: ; %bb.0: ; %entry 11597; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11598; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11599; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11600; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11601; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11602; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11603; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 11604; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11605; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11606; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11607; GFX11-WGP-NEXT: buffer_gl0_inv 11608; GFX11-WGP-NEXT: s_endpgm 11609; 11610; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11611; GFX11-CU: ; %bb.0: ; %entry 11612; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11613; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11614; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11615; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11616; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11617; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11618; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11619; GFX11-CU-NEXT: s_endpgm 11620; 11621; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11622; GFX12-WGP: ; %bb.0: ; %entry 11623; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11624; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11625; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11626; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11627; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11628; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11629; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 11630; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 11631; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 11632; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11633; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 11634; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11635; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 11636; GFX12-WGP-NEXT: s_endpgm 11637; 11638; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: 11639; GFX12-CU: ; %bb.0: ; %entry 11640; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 
11641; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11642; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11643; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11644; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11645; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11646; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11647; GFX12-CU-NEXT: s_endpgm 11648 ptr %out, i32 %in) { 11649entry: 11650 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel 11651 ret void 11652} 11653 11654define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( 11655; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11656; GFX7: ; %bb.0: ; %entry 11657; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11658; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11659; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11660; GFX7-NEXT: v_mov_b32_e32 v0, s6 11661; GFX7-NEXT: v_mov_b32_e32 v1, s7 11662; GFX7-NEXT: v_mov_b32_e32 v2, s4 11663; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11664; GFX7-NEXT: s_endpgm 11665; 11666; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11667; GFX10-WGP: ; %bb.0: ; %entry 11668; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11669; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11670; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11671; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11672; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11673; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11674; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 11675; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11676; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11677; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11678; GFX10-WGP-NEXT: buffer_gl0_inv 11679; GFX10-WGP-NEXT: s_endpgm 11680; 11681; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11682; GFX10-CU: ; %bb.0: ; %entry 11683; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11684; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11685; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11686; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11687; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11688; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 
11689; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11690; GFX10-CU-NEXT: s_endpgm 11691; 11692; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11693; SKIP-CACHE-INV: ; %bb.0: ; %entry 11694; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11695; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11696; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11697; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11698; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11699; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11700; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11701; SKIP-CACHE-INV-NEXT: s_endpgm 11702; 11703; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11704; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11705; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11706; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11707; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11708; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11709; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11710; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11711; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11712; 11713; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11714; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11715; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11716; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11717; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11718; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11719; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11720; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11721; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11722; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11723; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 11724; GFX90A-TGSPLIT-NEXT: s_endpgm 11725; 11726; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11727; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11728; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11729; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11730; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11731; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11732; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11733; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11734; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11735; 11736; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11737; GFX940-TGSPLIT: ; %bb.0: ; %entry 11738; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11739; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11740; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11741; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11742; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11743; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11744; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11745; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11746; GFX940-TGSPLIT-NEXT: buffer_inv sc0 11747; GFX940-TGSPLIT-NEXT: s_endpgm 11748; 11749; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11750; GFX11-WGP: ; %bb.0: ; %entry 11751; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11752; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11753; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11754; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11755; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11756; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11757; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 11758; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11759; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11760; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 11761; GFX11-WGP-NEXT: buffer_gl0_inv 11762; GFX11-WGP-NEXT: s_endpgm 11763; 11764; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11765; GFX11-CU: ; %bb.0: ; %entry 11766; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11767; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11768; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11769; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11770; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11771; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11772; GFX11-CU-NEXT: 
flat_atomic_swap_b32 v[0:1], v2 11773; GFX11-CU-NEXT: s_endpgm 11774; 11775; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11776; GFX12-WGP: ; %bb.0: ; %entry 11777; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11778; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11779; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11780; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11781; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11782; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11783; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 11784; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 11785; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 11786; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11787; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE 11788; GFX12-WGP-NEXT: s_wait_storecnt 0x0 11789; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 11790; GFX12-WGP-NEXT: s_endpgm 11791; 11792; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: 11793; GFX12-CU: ; %bb.0: ; %entry 11794; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11795; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11796; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11797; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11798; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11799; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11800; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11801; GFX12-CU-NEXT: s_endpgm 11802 ptr %out, i32 %in) { 11803entry: 11804 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst 11805 ret void 11806} 11807 11808define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( 11809; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11810; GFX7: ; %bb.0: ; %entry 11811; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11812; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 11813; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11814; GFX7-NEXT: v_mov_b32_e32 v0, s4 11815; GFX7-NEXT: v_mov_b32_e32 v1, s5 11816; GFX7-NEXT: v_mov_b32_e32 v2, s6 11817; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11818; GFX7-NEXT: v_mov_b32_e32 v0, s4 11819; GFX7-NEXT: v_mov_b32_e32 v1, s5 11820; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11821; GFX7-NEXT: flat_store_dword v[0:1], v2 11822; GFX7-NEXT: s_endpgm 11823; 11824; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11825; GFX10-WGP: ; %bb.0: ; %entry 11826; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11827; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 11828; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11829; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11830; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11831; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 11832; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11833; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 11834; GFX10-WGP-NEXT: buffer_gl0_inv 11835; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11836; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11837; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11838; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11839; GFX10-WGP-NEXT: s_endpgm 11840; 11841; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11842; GFX10-CU: ; %bb.0: ; %entry 11843; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11844; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 11845; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11846; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11847; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11848; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 11849; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11850; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11851; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11852; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11853; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11854; GFX10-CU-NEXT: s_endpgm 11855; 11856; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11857; SKIP-CACHE-INV: ; %bb.0: ; %entry 11858; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11859; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 11860; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11861; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 11862; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 11863; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 11864; SKIP-CACHE-INV-NEXT: 
flat_atomic_swap v2, v[0:1], v2 glc 11865; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 11866; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 11867; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11868; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11869; SKIP-CACHE-INV-NEXT: s_endpgm 11870; 11871; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11872; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11873; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11874; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 11875; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11876; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11877; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 11878; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11879; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11880; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11881; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11882; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11883; 11884; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11885; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11886; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11887; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 11888; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11889; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11890; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 11891; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11892; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11893; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 11894; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11895; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11896; GFX90A-TGSPLIT-NEXT: s_endpgm 11897; 11898; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11899; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11900; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11901; GFX940-NOTTGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0x8 11902; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11903; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11904; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 11905; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 11906; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11907; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11908; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11909; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11910; 11911; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11912; GFX940-TGSPLIT: ; %bb.0: ; %entry 11913; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11914; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 11915; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11916; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11917; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 11918; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 11919; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11920; GFX940-TGSPLIT-NEXT: buffer_inv sc0 11921; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11922; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11923; GFX940-TGSPLIT-NEXT: s_endpgm 11924; 11925; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11926; GFX11-WGP: ; %bb.0: ; %entry 11927; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11928; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 11929; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11930; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 11931; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 11932; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 11933; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 11934; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 11935; GFX11-WGP-NEXT: buffer_gl0_inv 11936; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 11937; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 11938; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11939; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11940; GFX11-WGP-NEXT: s_endpgm 11941; 11942; GFX11-CU-LABEL: 
flat_workgroup_one_as_acquire_ret_atomicrmw: 11943; GFX11-CU: ; %bb.0: ; %entry 11944; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11945; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 11946; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11947; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 11948; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 11949; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 11950; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 11951; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 11952; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 11953; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11954; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11955; GFX11-CU-NEXT: s_endpgm 11956; 11957; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11958; GFX12-WGP: ; %bb.0: ; %entry 11959; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11960; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 11961; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11962; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 11963; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 11964; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 11965; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE 11966; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 11967; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 11968; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 11969; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 11970; GFX12-WGP-NEXT: s_wait_dscnt 0x0 11971; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 11972; GFX12-WGP-NEXT: s_endpgm 11973; 11974; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: 11975; GFX12-CU: ; %bb.0: ; %entry 11976; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11977; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 11978; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11979; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 11980; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 11981; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 11982; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 11983; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 11984; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 11985; GFX12-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 11986; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 11987; GFX12-CU-NEXT: s_endpgm 11988 ptr %out, i32 %in) { 11989entry: 11990 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire 11991 store i32 %val, ptr %out, align 4 11992 ret void 11993} 11994 11995define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( 11996; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 11997; GFX7: ; %bb.0: ; %entry 11998; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11999; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 12000; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12001; GFX7-NEXT: v_mov_b32_e32 v0, s4 12002; GFX7-NEXT: v_mov_b32_e32 v1, s5 12003; GFX7-NEXT: v_mov_b32_e32 v2, s6 12004; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12005; GFX7-NEXT: v_mov_b32_e32 v0, s4 12006; GFX7-NEXT: v_mov_b32_e32 v1, s5 12007; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12008; GFX7-NEXT: flat_store_dword v[0:1], v2 12009; GFX7-NEXT: s_endpgm 12010; 12011; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12012; GFX10-WGP: ; %bb.0: ; %entry 12013; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12014; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 12015; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12016; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12017; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12018; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 12019; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 12020; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12021; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12022; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 12023; GFX10-WGP-NEXT: buffer_gl0_inv 12024; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12025; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12026; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12027; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 12028; GFX10-WGP-NEXT: s_endpgm 12029; 12030; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12031; GFX10-CU: ; %bb.0: ; %entry 12032; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 
12033; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 12034; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12035; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12036; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12037; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 12038; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12039; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12040; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12041; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12042; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 12043; GFX10-CU-NEXT: s_endpgm 12044; 12045; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12046; SKIP-CACHE-INV: ; %bb.0: ; %entry 12047; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12048; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 12049; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12050; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12051; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12052; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 12053; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12054; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12055; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12056; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12057; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 12058; SKIP-CACHE-INV-NEXT: s_endpgm 12059; 12060; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12061; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12062; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12063; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 12064; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12065; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12066; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 12067; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12068; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12069; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12070; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 12071; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 
12072; 12073; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12074; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12075; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12076; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 12077; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12078; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12079; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 12080; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12081; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12082; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12083; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 12084; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12085; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 12086; GFX90A-TGSPLIT-NEXT: s_endpgm 12087; 12088; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12089; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12090; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12091; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 12092; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12093; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12094; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 12095; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 12096; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12097; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12098; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 12099; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12100; 12101; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12102; GFX940-TGSPLIT: ; %bb.0: ; %entry 12103; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12104; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 12105; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12106; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12107; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 12108; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12109; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, 
v[0:1], v2 sc0 12110; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12111; GFX940-TGSPLIT-NEXT: buffer_inv sc0 12112; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12113; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 12114; GFX940-TGSPLIT-NEXT: s_endpgm 12115; 12116; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12117; GFX11-WGP: ; %bb.0: ; %entry 12118; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12119; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12120; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12121; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12122; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12123; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 12124; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 12125; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12126; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 12127; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 12128; GFX11-WGP-NEXT: buffer_gl0_inv 12129; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12130; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12131; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12132; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 12133; GFX11-WGP-NEXT: s_endpgm 12134; 12135; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12136; GFX11-CU: ; %bb.0: ; %entry 12137; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12138; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12139; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12140; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12141; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12142; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 12143; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 12144; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12145; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12146; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12147; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 12148; GFX11-CU-NEXT: s_endpgm 12149; 12150; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12151; GFX12-WGP: ; %bb.0: ; %entry 12152; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12153; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12154; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12155; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12156; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12157; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 12158; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 12159; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 12160; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 12161; GFX12-WGP-NEXT: s_wait_storecnt 0x0 12162; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE 12163; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 12164; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 12165; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 12166; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 12167; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12168; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12169; GFX12-WGP-NEXT: s_wait_dscnt 0x0 12170; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 12171; GFX12-WGP-NEXT: s_endpgm 12172; 12173; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: 12174; GFX12-CU: ; %bb.0: ; %entry 12175; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12176; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12177; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12178; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12179; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12180; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 12181; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 12182; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12183; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12184; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 12185; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 12186; GFX12-CU-NEXT: s_endpgm 12187 ptr %out, i32 %in) { 12188entry: 12189 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel 12190 store i32 %val, ptr %out, align 4 12191 ret void 12192} 12193 12194define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( 12195; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12196; GFX7: ; %bb.0: ; %entry 12197; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12198; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 12199; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
12200; GFX7-NEXT: v_mov_b32_e32 v0, s4 12201; GFX7-NEXT: v_mov_b32_e32 v1, s5 12202; GFX7-NEXT: v_mov_b32_e32 v2, s6 12203; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12204; GFX7-NEXT: v_mov_b32_e32 v0, s4 12205; GFX7-NEXT: v_mov_b32_e32 v1, s5 12206; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12207; GFX7-NEXT: flat_store_dword v[0:1], v2 12208; GFX7-NEXT: s_endpgm 12209; 12210; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12211; GFX10-WGP: ; %bb.0: ; %entry 12212; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12213; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 12214; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12215; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12216; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12217; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 12218; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 12219; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12220; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12221; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 12222; GFX10-WGP-NEXT: buffer_gl0_inv 12223; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12224; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12225; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12226; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 12227; GFX10-WGP-NEXT: s_endpgm 12228; 12229; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12230; GFX10-CU: ; %bb.0: ; %entry 12231; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12232; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 12233; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12234; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12235; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12236; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 12237; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12238; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12239; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12240; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12241; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 12242; GFX10-CU-NEXT: s_endpgm 12243; 12244; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12245; SKIP-CACHE-INV: ; %bb.0: ; 
%entry 12246; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12247; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 12248; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12249; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12251; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 12252; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12253; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12254; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12255; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12256; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 12257; SKIP-CACHE-INV-NEXT: s_endpgm 12258; 12259; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12260; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12261; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12262; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 12263; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12264; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12265; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 12266; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12267; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12268; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12269; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 12270; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12271; 12272; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12273; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12274; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12275; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 12276; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12277; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12278; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 12279; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12280; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12281; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12282; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol 12283; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12284; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 12285; GFX90A-TGSPLIT-NEXT: s_endpgm 12286; 12287; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12288; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12289; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12290; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 12291; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12292; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12293; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 12294; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 12295; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12296; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12297; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 12298; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12299; 12300; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12301; GFX940-TGSPLIT: ; %bb.0: ; %entry 12302; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12303; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 12304; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12305; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12306; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 12307; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12308; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 12309; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12310; GFX940-TGSPLIT-NEXT: buffer_inv sc0 12311; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12312; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 12313; GFX940-TGSPLIT-NEXT: s_endpgm 12314; 12315; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12316; GFX11-WGP: ; %bb.0: ; %entry 12317; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12318; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12319; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12320; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12321; GFX11-WGP-NEXT: v_mov_b32_e32 
v1, s1 12322; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 12323; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 12324; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12325; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 12326; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 12327; GFX11-WGP-NEXT: buffer_gl0_inv 12328; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12329; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12330; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12331; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 12332; GFX11-WGP-NEXT: s_endpgm 12333; 12334; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12335; GFX11-CU: ; %bb.0: ; %entry 12336; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12337; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12338; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12339; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12340; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12341; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 12342; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 12343; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12344; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12345; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12346; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 12347; GFX11-CU-NEXT: s_endpgm 12348; 12349; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12350; GFX12-WGP: ; %bb.0: ; %entry 12351; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12352; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12353; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12354; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12355; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12356; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 12357; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 12358; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 12359; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 12360; GFX12-WGP-NEXT: s_wait_storecnt 0x0 12361; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE 12362; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 12363; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 12364; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 12365; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 12366; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12367; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12368; GFX12-WGP-NEXT: s_wait_dscnt 0x0 12369; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 12370; GFX12-WGP-NEXT: s_endpgm 12371; 12372; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: 12373; GFX12-CU: ; %bb.0: ; %entry 12374; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12375; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12376; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12377; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12378; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12379; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 12380; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 12381; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12382; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12383; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 12384; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 12385; GFX12-CU-NEXT: s_endpgm 12386 ptr %out, i32 %in) { 12387entry: 12388 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst 12389 store i32 %val, ptr %out, align 4 12390 ret void 12391} 12392 12393define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( 12394; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12395; GFX7: ; %bb.0: ; %entry 12396; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12397; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12398; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12399; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 12400; GFX7-NEXT: s_mov_b64 s[10:11], 16 12401; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12402; GFX7-NEXT: s_mov_b32 s4, s8 12403; GFX7-NEXT: s_mov_b32 s5, s9 12404; GFX7-NEXT: s_mov_b32 s9, s10 12405; GFX7-NEXT: s_mov_b32 s8, s11 12406; GFX7-NEXT: s_add_u32 s4, s4, s9 12407; GFX7-NEXT: s_addc_u32 s8, s5, s8 12408; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12409; GFX7-NEXT: s_mov_b32 s5, s8 12410; GFX7-NEXT: v_mov_b32_e32 v2, s7 12411; GFX7-NEXT: v_mov_b32_e32 v0, s6 12412; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec 12413; GFX7-NEXT: v_mov_b32_e32 v3, v0 12414; GFX7-NEXT: v_mov_b32_e32 v0, s4 12415; GFX7-NEXT: v_mov_b32_e32 v1, s5 12416; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12417; GFX7-NEXT: s_endpgm 12418; 12419; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12420; GFX10-WGP: ; %bb.0: ; %entry 12421; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12422; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12423; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12424; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12425; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12426; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12427; GFX10-WGP-NEXT: s_mov_b32 s4, s8 12428; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12429; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12430; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12431; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12432; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12433; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12434; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12435; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12436; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12437; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12438; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12439; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12440; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12441; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12442; GFX10-WGP-NEXT: s_endpgm 12443; 12444; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12445; GFX10-CU: ; %bb.0: ; %entry 12446; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 12447; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12448; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12449; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12450; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12451; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12452; GFX10-CU-NEXT: s_mov_b32 s4, s8 12453; GFX10-CU-NEXT: s_mov_b32 s5, s9 12454; GFX10-CU-NEXT: s_mov_b32 s9, s10 12455; GFX10-CU-NEXT: s_mov_b32 s8, s11 12456; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 
12457; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12458; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12459; GFX10-CU-NEXT: s_mov_b32 s5, s8 12460; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12461; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12462; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12463; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12464; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12465; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12466; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12467; GFX10-CU-NEXT: s_endpgm 12468; 12469; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12470; SKIP-CACHE-INV: ; %bb.0: ; %entry 12471; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12472; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12473; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12474; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12475; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12476; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12477; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12478; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12479; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12480; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12481; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12482; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12483; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12484; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 12485; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12486; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12487; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12488; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12491; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12492; SKIP-CACHE-INV-NEXT: s_endpgm 12493; 12494; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12495; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12496; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12497; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12498; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12499; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12500; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12501; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12502; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12503; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12504; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12505; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12506; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12507; 12508; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12509; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12510; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12511; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12512; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12513; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12514; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12515; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12516; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12517; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12518; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12519; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12520; GFX90A-TGSPLIT-NEXT: s_endpgm 12521; 12522; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12523; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12524; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12525; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12526; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12527; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12528; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12529; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12530; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 12531; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12532; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12533; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12534; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12535; 12536; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12537; GFX940-TGSPLIT: ; %bb.0: ; %entry 12538; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12539; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12540; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12541; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12542; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12543; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12544; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12545; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12546; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12547; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12548; GFX940-TGSPLIT-NEXT: s_endpgm 12549; 12550; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12551; GFX11-WGP: ; %bb.0: ; %entry 12552; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12553; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12554; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12555; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12556; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 12557; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 12558; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12559; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 12560; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12561; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12562; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12563; GFX11-WGP-NEXT: s_endpgm 12564; 12565; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12566; GFX11-CU: ; %bb.0: ; %entry 12567; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12568; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12569; GFX11-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc 12570; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12571; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 12572; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 12573; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12574; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 12575; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12576; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12577; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12578; GFX11-CU-NEXT: s_endpgm 12579; 12580; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12581; GFX12-WGP: ; %bb.0: ; %entry 12582; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12583; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12584; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12585; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12586; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 12587; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 12588; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12589; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 12590; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12591; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12592; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 12593; GFX12-WGP-NEXT: s_endpgm 12594; 12595; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: 12596; GFX12-CU: ; %bb.0: ; %entry 12597; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12598; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12599; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12600; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12601; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 12602; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 12603; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12604; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 12605; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12606; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12607; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12608; GFX12-CU-NEXT: s_endpgm 12609 ptr %out, i32 %in, i32 %old) { 12610entry: 12611 %gep = getelementptr i32, ptr %out, i32 4 
12612 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic 12613 ret void 12614} 12615 12616define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( 12617; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12618; GFX7: ; %bb.0: ; %entry 12619; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12620; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12621; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12622; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 12623; GFX7-NEXT: s_mov_b64 s[10:11], 16 12624; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12625; GFX7-NEXT: s_mov_b32 s4, s8 12626; GFX7-NEXT: s_mov_b32 s5, s9 12627; GFX7-NEXT: s_mov_b32 s9, s10 12628; GFX7-NEXT: s_mov_b32 s8, s11 12629; GFX7-NEXT: s_add_u32 s4, s4, s9 12630; GFX7-NEXT: s_addc_u32 s8, s5, s8 12631; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12632; GFX7-NEXT: s_mov_b32 s5, s8 12633; GFX7-NEXT: v_mov_b32_e32 v2, s7 12634; GFX7-NEXT: v_mov_b32_e32 v0, s6 12635; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12636; GFX7-NEXT: v_mov_b32_e32 v3, v0 12637; GFX7-NEXT: v_mov_b32_e32 v0, s4 12638; GFX7-NEXT: v_mov_b32_e32 v1, s5 12639; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12640; GFX7-NEXT: s_endpgm 12641; 12642; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12643; GFX10-WGP: ; %bb.0: ; %entry 12644; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12645; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12646; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12647; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12648; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12649; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12650; GFX10-WGP-NEXT: s_mov_b32 s4, s8 12651; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12652; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12653; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12654; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12655; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12656; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def 
$sgpr4_sgpr5 12657; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12658; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12659; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12660; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12661; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12662; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12663; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12664; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12665; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12666; GFX10-WGP-NEXT: buffer_gl0_inv 12667; GFX10-WGP-NEXT: s_endpgm 12668; 12669; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12670; GFX10-CU: ; %bb.0: ; %entry 12671; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 12672; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12673; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12674; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12675; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12676; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12677; GFX10-CU-NEXT: s_mov_b32 s4, s8 12678; GFX10-CU-NEXT: s_mov_b32 s5, s9 12679; GFX10-CU-NEXT: s_mov_b32 s9, s10 12680; GFX10-CU-NEXT: s_mov_b32 s8, s11 12681; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 12682; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12683; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12684; GFX10-CU-NEXT: s_mov_b32 s5, s8 12685; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12686; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12687; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12688; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12689; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12690; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12691; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12692; GFX10-CU-NEXT: s_endpgm 12693; 12694; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12695; SKIP-CACHE-INV: ; %bb.0: ; %entry 12696; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12697; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12698; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12699; 
SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12700; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12701; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12702; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12703; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12704; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12705; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12706; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12707; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12708; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12709; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 12710; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12711; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12712; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12713; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12714; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12715; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12716; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12717; SKIP-CACHE-INV-NEXT: s_endpgm 12718; 12719; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12720; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12721; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12722; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12723; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12724; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12725; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12726; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12727; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12728; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12729; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12730; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12731; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12732; 12733; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12734; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12735; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12736; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12737; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12738; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12739; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12740; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12741; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12742; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12743; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12744; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12745; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12746; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 12747; GFX90A-TGSPLIT-NEXT: s_endpgm 12748; 12749; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12750; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12751; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12752; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12753; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12754; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12755; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12756; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12757; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12758; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12759; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12760; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12761; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12762; 12763; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12764; GFX940-TGSPLIT: ; %bb.0: ; %entry 12765; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12766; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12767; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12768; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12769; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12770; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12771; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec 12772; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12773; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12774; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12775; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12776; GFX940-TGSPLIT-NEXT: buffer_inv sc0 12777; GFX940-TGSPLIT-NEXT: s_endpgm 12778; 12779; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12780; GFX11-WGP: ; %bb.0: ; %entry 12781; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12782; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12783; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12784; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12785; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 12786; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 12787; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12788; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 12789; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12790; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12791; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12792; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12793; GFX11-WGP-NEXT: buffer_gl0_inv 12794; GFX11-WGP-NEXT: s_endpgm 12795; 12796; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12797; GFX11-CU: ; %bb.0: ; %entry 12798; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12799; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12800; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12801; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12802; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 12803; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 12804; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12805; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 12806; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12807; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12808; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12809; GFX11-CU-NEXT: s_endpgm 12810; 12811; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12812; GFX12-WGP: ; %bb.0: ; %entry 12813; GFX12-WGP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 12814; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12815; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12816; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12817; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 12818; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 12819; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12820; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 12821; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12822; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12823; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 12824; GFX12-WGP-NEXT: s_wait_storecnt 0x0 12825; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 12826; GFX12-WGP-NEXT: s_endpgm 12827; 12828; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: 12829; GFX12-CU: ; %bb.0: ; %entry 12830; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12831; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12832; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12833; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12834; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 12835; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 12836; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12837; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 12838; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12839; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12840; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12841; GFX12-CU-NEXT: s_endpgm 12842 ptr %out, i32 %in, i32 %old) { 12843entry: 12844 %gep = getelementptr i32, ptr %out, i32 4 12845 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic 12846 ret void 12847} 12848 12849define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( 12850; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12851; GFX7: ; %bb.0: ; %entry 12852; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12853; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12854; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12855; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 12856; GFX7-NEXT: 
s_mov_b64 s[10:11], 16 12857; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12858; GFX7-NEXT: s_mov_b32 s4, s8 12859; GFX7-NEXT: s_mov_b32 s5, s9 12860; GFX7-NEXT: s_mov_b32 s9, s10 12861; GFX7-NEXT: s_mov_b32 s8, s11 12862; GFX7-NEXT: s_add_u32 s4, s4, s9 12863; GFX7-NEXT: s_addc_u32 s8, s5, s8 12864; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12865; GFX7-NEXT: s_mov_b32 s5, s8 12866; GFX7-NEXT: v_mov_b32_e32 v2, s7 12867; GFX7-NEXT: v_mov_b32_e32 v0, s6 12868; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12869; GFX7-NEXT: v_mov_b32_e32 v3, v0 12870; GFX7-NEXT: v_mov_b32_e32 v0, s4 12871; GFX7-NEXT: v_mov_b32_e32 v1, s5 12872; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12873; GFX7-NEXT: s_endpgm 12874; 12875; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12876; GFX10-WGP: ; %bb.0: ; %entry 12877; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12878; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12879; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12880; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12881; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12882; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12883; GFX10-WGP-NEXT: s_mov_b32 s4, s8 12884; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12885; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12886; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12887; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12888; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12889; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12890; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12891; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12892; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12893; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12894; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12895; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12896; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12897; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 12898; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 12899; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12900; GFX10-WGP-NEXT: 
s_endpgm 12901; 12902; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12903; GFX10-CU: ; %bb.0: ; %entry 12904; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 12905; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12906; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12907; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12908; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12909; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12910; GFX10-CU-NEXT: s_mov_b32 s4, s8 12911; GFX10-CU-NEXT: s_mov_b32 s5, s9 12912; GFX10-CU-NEXT: s_mov_b32 s9, s10 12913; GFX10-CU-NEXT: s_mov_b32 s8, s11 12914; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 12915; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12916; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12917; GFX10-CU-NEXT: s_mov_b32 s5, s8 12918; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12919; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12920; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12921; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12922; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12923; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12924; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12925; GFX10-CU-NEXT: s_endpgm 12926; 12927; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12928; SKIP-CACHE-INV: ; %bb.0: ; %entry 12929; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12930; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12931; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12932; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12933; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12934; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12935; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12936; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12937; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12938; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12939; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12940; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12941; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12942; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s1, s4 12943; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12944; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12945; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12946; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12947; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12948; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12949; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12950; SKIP-CACHE-INV-NEXT: s_endpgm 12951; 12952; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12953; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12954; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12955; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12956; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12957; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12958; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12959; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12960; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12961; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12962; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12963; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12964; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12965; 12966; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12967; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12968; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12969; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12970; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12971; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12972; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12973; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12974; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12975; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12976; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12977; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12978; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12979; GFX90A-TGSPLIT-NEXT: s_endpgm 12980; 12981; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12982; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12983; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12984; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12985; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12986; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12987; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12988; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12989; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12990; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12991; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12992; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12993; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12994; 12995; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 12996; GFX940-TGSPLIT: ; %bb.0: ; %entry 12997; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12998; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12999; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13000; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13001; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13002; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13003; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13004; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13005; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13006; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13007; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13008; GFX940-TGSPLIT-NEXT: s_endpgm 13009; 13010; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 13011; GFX11-WGP: ; %bb.0: ; %entry 13012; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13013; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13014; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13015; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 13016; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13017; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13018; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13019; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13020; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13021; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13022; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 13023; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13024; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13025; GFX11-WGP-NEXT: s_endpgm 13026; 13027; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 13028; GFX11-CU: ; %bb.0: ; %entry 13029; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13030; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13031; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13032; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13033; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13034; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13035; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13036; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13037; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13038; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13039; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13040; GFX11-CU-NEXT: s_endpgm 13041; 13042; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 13043; GFX12-WGP: ; %bb.0: ; %entry 13044; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13045; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13046; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13047; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13048; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13049; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13050; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13051; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13052; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13053; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13054; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 13055; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 13056; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 13057; GFX12-WGP-NEXT: 
s_wait_storecnt 0x0 13058; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 13059; GFX12-WGP-NEXT: s_endpgm 13060; 13061; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: 13062; GFX12-CU: ; %bb.0: ; %entry 13063; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13064; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13065; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13066; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13067; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13068; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13069; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13070; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13071; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13072; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13073; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13074; GFX12-CU-NEXT: s_endpgm 13075 ptr %out, i32 %in, i32 %old) { 13076entry: 13077 %gep = getelementptr i32, ptr %out, i32 4 13078 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic 13079 ret void 13080} 13081 13082define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( 13083; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13084; GFX7: ; %bb.0: ; %entry 13085; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13086; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13087; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13088; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13089; GFX7-NEXT: s_mov_b64 s[10:11], 16 13090; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13091; GFX7-NEXT: s_mov_b32 s4, s8 13092; GFX7-NEXT: s_mov_b32 s5, s9 13093; GFX7-NEXT: s_mov_b32 s9, s10 13094; GFX7-NEXT: s_mov_b32 s8, s11 13095; GFX7-NEXT: s_add_u32 s4, s4, s9 13096; GFX7-NEXT: s_addc_u32 s8, s5, s8 13097; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13098; GFX7-NEXT: s_mov_b32 s5, s8 13099; GFX7-NEXT: v_mov_b32_e32 v2, s7 13100; GFX7-NEXT: v_mov_b32_e32 v0, s6 13101; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec 13102; GFX7-NEXT: v_mov_b32_e32 v3, v0 13103; GFX7-NEXT: v_mov_b32_e32 v0, s4 13104; GFX7-NEXT: v_mov_b32_e32 v1, s5 13105; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13106; GFX7-NEXT: s_endpgm 13107; 13108; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13109; GFX10-WGP: ; %bb.0: ; %entry 13110; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13111; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13112; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13113; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13114; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13115; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13116; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13117; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13118; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13119; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13120; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13121; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13122; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13123; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13124; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13125; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13126; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13127; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13128; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13129; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13130; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 13131; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13132; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13133; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13134; GFX10-WGP-NEXT: buffer_gl0_inv 13135; GFX10-WGP-NEXT: s_endpgm 13136; 13137; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13138; GFX10-CU: ; %bb.0: ; %entry 13139; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13140; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13141; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13142; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13143; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13144; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13145; GFX10-CU-NEXT: 
s_mov_b32 s4, s8 13146; GFX10-CU-NEXT: s_mov_b32 s5, s9 13147; GFX10-CU-NEXT: s_mov_b32 s9, s10 13148; GFX10-CU-NEXT: s_mov_b32 s8, s11 13149; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13150; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13151; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13152; GFX10-CU-NEXT: s_mov_b32 s5, s8 13153; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13154; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13155; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13156; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13157; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13158; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13159; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13160; GFX10-CU-NEXT: s_endpgm 13161; 13162; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13163; SKIP-CACHE-INV: ; %bb.0: ; %entry 13164; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13165; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13166; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13167; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13168; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13169; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13170; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13171; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13172; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13173; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13174; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13175; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13176; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13177; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13178; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13179; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13180; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13181; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13182; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13183; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13184; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13185; 
SKIP-CACHE-INV-NEXT: s_endpgm 13186; 13187; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13188; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13189; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13190; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13191; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13192; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13193; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13194; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13195; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13196; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13197; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13198; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13199; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13200; 13201; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13202; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13203; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13204; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13205; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13206; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13207; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13208; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13209; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13210; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13211; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13212; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13213; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13214; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13215; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 13216; GFX90A-TGSPLIT-NEXT: s_endpgm 13217; 13218; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13219; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13220; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13221; GFX940-NOTTGSPLIT-NEXT: 
s_load_dword s3, s[4:5], 0x8 13222; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13223; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13224; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13225; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13226; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13227; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13228; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13229; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13230; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13231; 13232; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13233; GFX940-TGSPLIT: ; %bb.0: ; %entry 13234; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13235; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13236; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13237; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13238; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13239; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13240; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13241; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13242; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13243; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13244; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13245; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13246; GFX940-TGSPLIT-NEXT: buffer_inv sc0 13247; GFX940-TGSPLIT-NEXT: s_endpgm 13248; 13249; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13250; GFX11-WGP: ; %bb.0: ; %entry 13251; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13252; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13253; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13254; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13255; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13256; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13257; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13258; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13259; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13260; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13261; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 13262; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13263; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13264; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13265; GFX11-WGP-NEXT: buffer_gl0_inv 13266; GFX11-WGP-NEXT: s_endpgm 13267; 13268; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13269; GFX11-CU: ; %bb.0: ; %entry 13270; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13271; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13272; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13273; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13274; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13275; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13276; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13277; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13278; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13279; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13280; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13281; GFX11-CU-NEXT: s_endpgm 13282; 13283; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13284; GFX12-WGP: ; %bb.0: ; %entry 13285; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13286; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13287; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13288; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13289; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13290; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13291; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13292; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13293; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13294; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13295; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 13296; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 13297; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 13298; GFX12-WGP-NEXT: s_wait_storecnt 0x0 13299; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 13300; GFX12-WGP-NEXT: s_wait_storecnt 0x0 13301; 
GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 13302; GFX12-WGP-NEXT: s_endpgm 13303; 13304; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: 13305; GFX12-CU: ; %bb.0: ; %entry 13306; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13307; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13308; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13309; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13310; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13311; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13312; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13313; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13314; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13315; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13316; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13317; GFX12-CU-NEXT: s_endpgm 13318 ptr %out, i32 %in, i32 %old) { 13319entry: 13320 %gep = getelementptr i32, ptr %out, i32 4 13321 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic 13322 ret void 13323} 13324 13325define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( 13326; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13327; GFX7: ; %bb.0: ; %entry 13328; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13329; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13330; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13331; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13332; GFX7-NEXT: s_mov_b64 s[10:11], 16 13333; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13334; GFX7-NEXT: s_mov_b32 s4, s8 13335; GFX7-NEXT: s_mov_b32 s5, s9 13336; GFX7-NEXT: s_mov_b32 s9, s10 13337; GFX7-NEXT: s_mov_b32 s8, s11 13338; GFX7-NEXT: s_add_u32 s4, s4, s9 13339; GFX7-NEXT: s_addc_u32 s8, s5, s8 13340; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13341; GFX7-NEXT: s_mov_b32 s5, s8 13342; GFX7-NEXT: v_mov_b32_e32 v2, s7 13343; GFX7-NEXT: v_mov_b32_e32 v0, s6 13344; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13345; GFX7-NEXT: v_mov_b32_e32 v3, v0 13346; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 13347; GFX7-NEXT: v_mov_b32_e32 v1, s5 13348; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13349; GFX7-NEXT: s_endpgm 13350; 13351; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13352; GFX10-WGP: ; %bb.0: ; %entry 13353; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13354; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13355; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13356; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13357; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13358; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13359; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13360; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13361; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13362; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13363; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13364; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13365; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13366; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13367; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13368; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13369; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13370; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13371; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13372; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13373; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 13374; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13375; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13376; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13377; GFX10-WGP-NEXT: buffer_gl0_inv 13378; GFX10-WGP-NEXT: s_endpgm 13379; 13380; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13381; GFX10-CU: ; %bb.0: ; %entry 13382; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13383; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13384; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13385; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13386; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13387; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13388; GFX10-CU-NEXT: s_mov_b32 s4, s8 13389; GFX10-CU-NEXT: s_mov_b32 s5, s9 13390; 
GFX10-CU-NEXT: s_mov_b32 s9, s10 13391; GFX10-CU-NEXT: s_mov_b32 s8, s11 13392; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13393; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13394; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13395; GFX10-CU-NEXT: s_mov_b32 s5, s8 13396; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13397; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13398; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13399; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13400; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13401; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13402; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13403; GFX10-CU-NEXT: s_endpgm 13404; 13405; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13406; SKIP-CACHE-INV: ; %bb.0: ; %entry 13407; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13408; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13409; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13410; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13411; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13412; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13413; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13414; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13415; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13416; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13417; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13418; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13419; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13420; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13421; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13422; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13423; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13424; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13425; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13426; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13427; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13428; SKIP-CACHE-INV-NEXT: s_endpgm 13429; 13430; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13431; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13432; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13433; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13434; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13435; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13436; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13437; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13438; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13439; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13440; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13441; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13442; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13443; 13444; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13445; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13446; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13447; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13448; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13449; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13450; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13451; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13452; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13453; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13454; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13455; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13456; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13457; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13458; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 13459; GFX90A-TGSPLIT-NEXT: s_endpgm 13460; 13461; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13462; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13463; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13464; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13465; GFX940-NOTTGSPLIT-NEXT: s_load_dword 
s2, s[4:5], 0xc 13466; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13467; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13468; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13469; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13470; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13471; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13472; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13473; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13474; 13475; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13476; GFX940-TGSPLIT: ; %bb.0: ; %entry 13477; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13478; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13479; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13480; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13481; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13482; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13483; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13484; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13485; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13486; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13487; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13488; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13489; GFX940-TGSPLIT-NEXT: buffer_inv sc0 13490; GFX940-TGSPLIT-NEXT: s_endpgm 13491; 13492; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13493; GFX11-WGP: ; %bb.0: ; %entry 13494; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13495; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13496; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13497; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13498; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13499; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13500; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13501; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13502; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13503; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 13504; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 13505; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13506; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13507; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13508; GFX11-WGP-NEXT: buffer_gl0_inv 13509; GFX11-WGP-NEXT: s_endpgm 13510; 13511; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13512; GFX11-CU: ; %bb.0: ; %entry 13513; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13514; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13515; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13516; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13517; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13518; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13519; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13520; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13521; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13522; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13523; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13524; GFX11-CU-NEXT: s_endpgm 13525; 13526; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13527; GFX12-WGP: ; %bb.0: ; %entry 13528; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13529; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13530; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13531; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13532; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13533; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13534; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13535; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13536; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13537; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13538; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 13539; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 13540; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 13541; GFX12-WGP-NEXT: s_wait_storecnt 0x0 13542; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 13543; GFX12-WGP-NEXT: s_wait_storecnt 0x0 13544; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 13545; 
GFX12-WGP-NEXT: s_endpgm 13546; 13547; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: 13548; GFX12-CU: ; %bb.0: ; %entry 13549; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13550; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13551; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13552; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13553; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13554; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13555; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13556; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13557; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13558; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13559; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13560; GFX12-CU-NEXT: s_endpgm 13561 ptr %out, i32 %in, i32 %old) { 13562entry: 13563 %gep = getelementptr i32, ptr %out, i32 4 13564 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic 13565 ret void 13566} 13567 13568define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( 13569; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13570; GFX7: ; %bb.0: ; %entry 13571; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13572; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13573; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13574; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13575; GFX7-NEXT: s_mov_b64 s[10:11], 16 13576; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13577; GFX7-NEXT: s_mov_b32 s4, s8 13578; GFX7-NEXT: s_mov_b32 s5, s9 13579; GFX7-NEXT: s_mov_b32 s9, s10 13580; GFX7-NEXT: s_mov_b32 s8, s11 13581; GFX7-NEXT: s_add_u32 s4, s4, s9 13582; GFX7-NEXT: s_addc_u32 s8, s5, s8 13583; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13584; GFX7-NEXT: s_mov_b32 s5, s8 13585; GFX7-NEXT: v_mov_b32_e32 v2, s7 13586; GFX7-NEXT: v_mov_b32_e32 v0, s6 13587; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13588; GFX7-NEXT: v_mov_b32_e32 v3, v0 13589; GFX7-NEXT: v_mov_b32_e32 v0, s4 13590; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 13591; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13592; GFX7-NEXT: s_endpgm 13593; 13594; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13595; GFX10-WGP: ; %bb.0: ; %entry 13596; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13597; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13598; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13599; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13600; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13601; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13602; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13603; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13604; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13605; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13606; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13607; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13608; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13609; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13610; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13611; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13612; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13613; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13614; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13615; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13616; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13617; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13618; GFX10-WGP-NEXT: buffer_gl0_inv 13619; GFX10-WGP-NEXT: s_endpgm 13620; 13621; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13622; GFX10-CU: ; %bb.0: ; %entry 13623; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13624; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13625; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13626; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13627; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13628; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13629; GFX10-CU-NEXT: s_mov_b32 s4, s8 13630; GFX10-CU-NEXT: s_mov_b32 s5, s9 13631; GFX10-CU-NEXT: s_mov_b32 s9, s10 13632; GFX10-CU-NEXT: s_mov_b32 s8, s11 13633; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13634; GFX10-CU-NEXT: 
s_addc_u32 s8, s5, s8 13635; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13636; GFX10-CU-NEXT: s_mov_b32 s5, s8 13637; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13638; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13639; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13640; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13641; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13642; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13643; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13644; GFX10-CU-NEXT: s_endpgm 13645; 13646; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13647; SKIP-CACHE-INV: ; %bb.0: ; %entry 13648; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13649; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13650; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13651; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13652; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13653; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13654; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13655; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13656; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13657; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13658; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13659; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13660; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13661; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13662; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13663; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13664; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13665; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13666; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13667; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13668; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13669; SKIP-CACHE-INV-NEXT: s_endpgm 13670; 13671; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13672; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13673; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 13674; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13675; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13676; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13677; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13678; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13679; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13680; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13681; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13682; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13683; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13684; 13685; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13686; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13687; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13688; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13689; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13690; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13691; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13692; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13693; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13694; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13695; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13696; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13697; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13698; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 13699; GFX90A-TGSPLIT-NEXT: s_endpgm 13700; 13701; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13702; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13703; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13704; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13705; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13706; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13707; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13708; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13709; 
GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13710; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13711; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13712; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13713; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13714; 13715; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13716; GFX940-TGSPLIT: ; %bb.0: ; %entry 13717; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13718; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13719; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13720; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13721; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13722; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13723; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13724; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13725; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13726; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13727; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13728; GFX940-TGSPLIT-NEXT: buffer_inv sc0 13729; GFX940-TGSPLIT-NEXT: s_endpgm 13730; 13731; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13732; GFX11-WGP: ; %bb.0: ; %entry 13733; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13734; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13735; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13736; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13737; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13738; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13739; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13740; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13741; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13742; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13743; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13744; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13745; GFX11-WGP-NEXT: buffer_gl0_inv 13746; GFX11-WGP-NEXT: s_endpgm 13747; 13748; 
GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13749; GFX11-CU: ; %bb.0: ; %entry 13750; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13751; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13752; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13753; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13754; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13755; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13756; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13757; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13758; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13759; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13760; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13761; GFX11-CU-NEXT: s_endpgm 13762; 13763; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13764; GFX12-WGP: ; %bb.0: ; %entry 13765; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13766; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13767; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13768; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13769; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13770; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13771; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13772; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13773; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13774; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13775; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 13776; GFX12-WGP-NEXT: s_wait_storecnt 0x0 13777; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 13778; GFX12-WGP-NEXT: s_endpgm 13779; 13780; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: 13781; GFX12-CU: ; %bb.0: ; %entry 13782; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13783; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13784; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13785; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13786; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13787; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13788; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
13789; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13790; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13791; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13792; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13793; GFX12-CU-NEXT: s_endpgm 13794 ptr %out, i32 %in, i32 %old) { 13795entry: 13796 %gep = getelementptr i32, ptr %out, i32 4 13797 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire 13798 ret void 13799} 13800 13801define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( 13802; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13803; GFX7: ; %bb.0: ; %entry 13804; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13805; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13806; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13807; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13808; GFX7-NEXT: s_mov_b64 s[10:11], 16 13809; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13810; GFX7-NEXT: s_mov_b32 s4, s8 13811; GFX7-NEXT: s_mov_b32 s5, s9 13812; GFX7-NEXT: s_mov_b32 s9, s10 13813; GFX7-NEXT: s_mov_b32 s8, s11 13814; GFX7-NEXT: s_add_u32 s4, s4, s9 13815; GFX7-NEXT: s_addc_u32 s8, s5, s8 13816; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13817; GFX7-NEXT: s_mov_b32 s5, s8 13818; GFX7-NEXT: v_mov_b32_e32 v2, s7 13819; GFX7-NEXT: v_mov_b32_e32 v0, s6 13820; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13821; GFX7-NEXT: v_mov_b32_e32 v3, v0 13822; GFX7-NEXT: v_mov_b32_e32 v0, s4 13823; GFX7-NEXT: v_mov_b32_e32 v1, s5 13824; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13825; GFX7-NEXT: s_endpgm 13826; 13827; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13828; GFX10-WGP: ; %bb.0: ; %entry 13829; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13830; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13831; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13832; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13833; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13834; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 13835; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13836; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13837; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13838; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13839; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13840; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13841; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13842; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13843; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13844; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13845; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13846; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13847; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13848; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13849; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13850; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13851; GFX10-WGP-NEXT: buffer_gl0_inv 13852; GFX10-WGP-NEXT: s_endpgm 13853; 13854; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13855; GFX10-CU: ; %bb.0: ; %entry 13856; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13857; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13858; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13859; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13860; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13861; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13862; GFX10-CU-NEXT: s_mov_b32 s4, s8 13863; GFX10-CU-NEXT: s_mov_b32 s5, s9 13864; GFX10-CU-NEXT: s_mov_b32 s9, s10 13865; GFX10-CU-NEXT: s_mov_b32 s8, s11 13866; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13867; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13868; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13869; GFX10-CU-NEXT: s_mov_b32 s5, s8 13870; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13871; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13872; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13873; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13874; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13875; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13876; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13877; GFX10-CU-NEXT: 
s_endpgm 13878; 13879; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13880; SKIP-CACHE-INV: ; %bb.0: ; %entry 13881; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13882; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13883; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13884; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13885; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13886; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13887; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13888; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13889; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13890; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13891; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13892; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13893; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13894; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13895; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13896; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13897; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13898; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13899; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13900; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13901; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13902; SKIP-CACHE-INV-NEXT: s_endpgm 13903; 13904; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13905; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13906; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13907; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13908; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13909; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13910; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13911; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13912; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13913; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13914; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 
13915; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13916; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13917; 13918; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13919; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13920; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13921; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13922; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13923; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13924; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13925; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13926; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13927; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13928; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13929; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13930; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13931; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 13932; GFX90A-TGSPLIT-NEXT: s_endpgm 13933; 13934; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13935; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13936; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13937; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13938; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13939; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13940; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13941; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13942; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13943; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13944; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13945; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13946; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13947; 13948; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13949; GFX940-TGSPLIT: ; %bb.0: ; %entry 13950; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13951; GFX940-TGSPLIT-NEXT: 
s_load_dword s3, s[4:5], 0x8 13952; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13953; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13954; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13955; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13956; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13957; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13958; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13959; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13960; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 13961; GFX940-TGSPLIT-NEXT: buffer_inv sc0 13962; GFX940-TGSPLIT-NEXT: s_endpgm 13963; 13964; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13965; GFX11-WGP: ; %bb.0: ; %entry 13966; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13967; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13968; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13969; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13970; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13971; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13972; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13973; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13974; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13975; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13976; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13977; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 13978; GFX11-WGP-NEXT: buffer_gl0_inv 13979; GFX11-WGP-NEXT: s_endpgm 13980; 13981; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13982; GFX11-CU: ; %bb.0: ; %entry 13983; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13984; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13985; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13986; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13987; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13988; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13989; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13990; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13991; GFX11-CU-NEXT: v_mov_b32_e32 v0, 
s0 13992; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13993; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13994; GFX11-CU-NEXT: s_endpgm 13995; 13996; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 13997; GFX12-WGP: ; %bb.0: ; %entry 13998; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13999; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14000; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14001; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14002; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14003; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14004; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14005; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14006; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14007; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14008; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 14009; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14010; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 14011; GFX12-WGP-NEXT: s_endpgm 14012; 14013; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: 14014; GFX12-CU: ; %bb.0: ; %entry 14015; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14016; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14017; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14018; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14019; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14020; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14021; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14022; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14023; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14024; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14025; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14026; GFX12-CU-NEXT: s_endpgm 14027 ptr %out, i32 %in, i32 %old) { 14028entry: 14029 %gep = getelementptr i32, ptr %out, i32 4 14030 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire 14031 ret void 14032} 14033 14034define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( 14035; 
GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14036; GFX7: ; %bb.0: ; %entry 14037; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14038; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14039; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14040; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14041; GFX7-NEXT: s_mov_b64 s[10:11], 16 14042; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14043; GFX7-NEXT: s_mov_b32 s4, s8 14044; GFX7-NEXT: s_mov_b32 s5, s9 14045; GFX7-NEXT: s_mov_b32 s9, s10 14046; GFX7-NEXT: s_mov_b32 s8, s11 14047; GFX7-NEXT: s_add_u32 s4, s4, s9 14048; GFX7-NEXT: s_addc_u32 s8, s5, s8 14049; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14050; GFX7-NEXT: s_mov_b32 s5, s8 14051; GFX7-NEXT: v_mov_b32_e32 v2, s7 14052; GFX7-NEXT: v_mov_b32_e32 v0, s6 14053; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14054; GFX7-NEXT: v_mov_b32_e32 v3, v0 14055; GFX7-NEXT: v_mov_b32_e32 v0, s4 14056; GFX7-NEXT: v_mov_b32_e32 v1, s5 14057; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14058; GFX7-NEXT: s_endpgm 14059; 14060; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14061; GFX10-WGP: ; %bb.0: ; %entry 14062; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14063; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14064; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14065; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14066; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14067; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14068; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14069; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14070; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14071; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14072; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14073; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14074; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14075; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14076; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14077; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14078; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14079; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14080; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14081; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14082; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 14083; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14084; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14085; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14086; GFX10-WGP-NEXT: buffer_gl0_inv 14087; GFX10-WGP-NEXT: s_endpgm 14088; 14089; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14090; GFX10-CU: ; %bb.0: ; %entry 14091; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14092; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14093; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14094; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14095; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14096; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14097; GFX10-CU-NEXT: s_mov_b32 s4, s8 14098; GFX10-CU-NEXT: s_mov_b32 s5, s9 14099; GFX10-CU-NEXT: s_mov_b32 s9, s10 14100; GFX10-CU-NEXT: s_mov_b32 s8, s11 14101; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14102; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14103; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14104; GFX10-CU-NEXT: s_mov_b32 s5, s8 14105; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14106; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14107; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14108; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14109; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14110; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14111; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14112; GFX10-CU-NEXT: s_endpgm 14113; 14114; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14115; SKIP-CACHE-INV: ; %bb.0: ; %entry 14116; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14117; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14118; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14119; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14120; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14121; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 
14122; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14123; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14124; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14125; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14126; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14127; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14128; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14129; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14130; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14131; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14132; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14133; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14134; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14135; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14136; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14137; SKIP-CACHE-INV-NEXT: s_endpgm 14138; 14139; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14140; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14141; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14142; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14143; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14144; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14145; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14146; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14147; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14148; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14149; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14150; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14151; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14152; 14153; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14154; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14155; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14156; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14157; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14158; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 14159; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14160; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14161; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14162; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14163; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14164; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14165; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14166; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14167; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 14168; GFX90A-TGSPLIT-NEXT: s_endpgm 14169; 14170; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14171; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14172; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14173; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14174; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14175; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14176; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14177; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14178; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14179; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14180; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14181; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14182; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14183; 14184; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14185; GFX940-TGSPLIT: ; %bb.0: ; %entry 14186; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14187; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14188; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14189; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14190; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14191; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14192; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14193; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14194; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] 14195; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14196; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14197; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14198; GFX940-TGSPLIT-NEXT: buffer_inv sc0 14199; GFX940-TGSPLIT-NEXT: s_endpgm 14200; 14201; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14202; GFX11-WGP: ; %bb.0: ; %entry 14203; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14204; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14205; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14206; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14207; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14208; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14209; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14210; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14211; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14212; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14213; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 14214; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14215; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14216; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14217; GFX11-WGP-NEXT: buffer_gl0_inv 14218; GFX11-WGP-NEXT: s_endpgm 14219; 14220; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14221; GFX11-CU: ; %bb.0: ; %entry 14222; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14223; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14224; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14225; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14226; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14227; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14228; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14229; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14230; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14231; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14232; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14233; GFX11-CU-NEXT: s_endpgm 14234; 14235; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14236; GFX12-WGP: ; %bb.0: ; %entry 
14237; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14238; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14239; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14240; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14241; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14242; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14243; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14244; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14245; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14246; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14247; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 14248; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 14249; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 14250; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14251; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 14252; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14253; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 14254; GFX12-WGP-NEXT: s_endpgm 14255; 14256; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: 14257; GFX12-CU: ; %bb.0: ; %entry 14258; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14259; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14260; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14261; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14262; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14263; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14264; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14265; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14266; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14267; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14268; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14269; GFX12-CU-NEXT: s_endpgm 14270 ptr %out, i32 %in, i32 %old) { 14271entry: 14272 %gep = getelementptr i32, ptr %out, i32 4 14273 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire 14274 ret void 14275} 14276 14277define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( 14278; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14279; GFX7: ; %bb.0: ; %entry 14280; 
GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14281; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14282; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14283; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14284; GFX7-NEXT: s_mov_b64 s[10:11], 16 14285; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14286; GFX7-NEXT: s_mov_b32 s4, s8 14287; GFX7-NEXT: s_mov_b32 s5, s9 14288; GFX7-NEXT: s_mov_b32 s9, s10 14289; GFX7-NEXT: s_mov_b32 s8, s11 14290; GFX7-NEXT: s_add_u32 s4, s4, s9 14291; GFX7-NEXT: s_addc_u32 s8, s5, s8 14292; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14293; GFX7-NEXT: s_mov_b32 s5, s8 14294; GFX7-NEXT: v_mov_b32_e32 v2, s7 14295; GFX7-NEXT: v_mov_b32_e32 v0, s6 14296; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14297; GFX7-NEXT: v_mov_b32_e32 v3, v0 14298; GFX7-NEXT: v_mov_b32_e32 v0, s4 14299; GFX7-NEXT: v_mov_b32_e32 v1, s5 14300; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14301; GFX7-NEXT: s_endpgm 14302; 14303; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14304; GFX10-WGP: ; %bb.0: ; %entry 14305; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14306; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14307; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14308; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14309; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14310; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14311; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14312; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14313; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14314; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14315; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14316; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14317; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14318; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14319; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14320; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14321; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14322; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14323; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14324; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14325; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 14326; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14327; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14328; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14329; GFX10-WGP-NEXT: buffer_gl0_inv 14330; GFX10-WGP-NEXT: s_endpgm 14331; 14332; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14333; GFX10-CU: ; %bb.0: ; %entry 14334; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14335; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14336; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14337; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14338; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14339; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14340; GFX10-CU-NEXT: s_mov_b32 s4, s8 14341; GFX10-CU-NEXT: s_mov_b32 s5, s9 14342; GFX10-CU-NEXT: s_mov_b32 s9, s10 14343; GFX10-CU-NEXT: s_mov_b32 s8, s11 14344; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14345; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14346; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14347; GFX10-CU-NEXT: s_mov_b32 s5, s8 14348; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14349; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14350; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14351; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14352; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14353; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14354; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14355; GFX10-CU-NEXT: s_endpgm 14356; 14357; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14358; SKIP-CACHE-INV: ; %bb.0: ; %entry 14359; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14360; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14361; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14362; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14363; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14364; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14365; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14366; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 
14367; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14368; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14369; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14370; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14371; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14372; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14373; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14374; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14375; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14376; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14377; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14378; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14379; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14380; SKIP-CACHE-INV-NEXT: s_endpgm 14381; 14382; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14383; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14384; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14385; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14386; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14387; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14388; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14389; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14390; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14391; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14392; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14393; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14394; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14395; 14396; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14397; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14398; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14399; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14400; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14401; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14402; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14403; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s6 14404; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14405; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14406; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14407; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14408; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14409; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14410; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 14411; GFX90A-TGSPLIT-NEXT: s_endpgm 14412; 14413; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14414; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14415; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14416; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14417; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14418; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14419; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14420; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14421; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14422; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14423; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14424; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14425; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14426; 14427; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14428; GFX940-TGSPLIT: ; %bb.0: ; %entry 14429; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14430; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14431; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14432; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14433; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14434; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14435; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14436; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14437; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14438; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14439; 
GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14440; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14441; GFX940-TGSPLIT-NEXT: buffer_inv sc0 14442; GFX940-TGSPLIT-NEXT: s_endpgm 14443; 14444; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14445; GFX11-WGP: ; %bb.0: ; %entry 14446; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14447; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14448; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14449; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14450; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14451; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14452; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14453; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14454; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14455; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14456; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 14457; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14458; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14459; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14460; GFX11-WGP-NEXT: buffer_gl0_inv 14461; GFX11-WGP-NEXT: s_endpgm 14462; 14463; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14464; GFX11-CU: ; %bb.0: ; %entry 14465; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14466; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14467; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14468; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14469; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14470; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14471; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14472; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14473; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14474; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14475; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14476; GFX11-CU-NEXT: s_endpgm 14477; 14478; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14479; GFX12-WGP: ; %bb.0: ; %entry 14480; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14481; GFX12-WGP-NEXT: 
s_load_b32 s3, s[4:5], 0x8 14482; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14483; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14484; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14485; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14486; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14487; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14488; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14489; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14490; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 14491; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 14492; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 14493; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14494; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 14495; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14496; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 14497; GFX12-WGP-NEXT: s_endpgm 14498; 14499; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: 14500; GFX12-CU: ; %bb.0: ; %entry 14501; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14502; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14503; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14504; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14505; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14506; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14507; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14508; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14509; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14510; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14511; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14512; GFX12-CU-NEXT: s_endpgm 14513 ptr %out, i32 %in, i32 %old) { 14514entry: 14515 %gep = getelementptr i32, ptr %out, i32 4 14516 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire 14517 ret void 14518} 14519 14520define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( 14521; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14522; GFX7: ; %bb.0: ; %entry 14523; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14524; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 14525; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14526; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14527; GFX7-NEXT: s_mov_b64 s[10:11], 16 14528; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14529; GFX7-NEXT: s_mov_b32 s4, s8 14530; GFX7-NEXT: s_mov_b32 s5, s9 14531; GFX7-NEXT: s_mov_b32 s9, s10 14532; GFX7-NEXT: s_mov_b32 s8, s11 14533; GFX7-NEXT: s_add_u32 s4, s4, s9 14534; GFX7-NEXT: s_addc_u32 s8, s5, s8 14535; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14536; GFX7-NEXT: s_mov_b32 s5, s8 14537; GFX7-NEXT: v_mov_b32_e32 v2, s7 14538; GFX7-NEXT: v_mov_b32_e32 v0, s6 14539; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14540; GFX7-NEXT: v_mov_b32_e32 v3, v0 14541; GFX7-NEXT: v_mov_b32_e32 v0, s4 14542; GFX7-NEXT: v_mov_b32_e32 v1, s5 14543; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14544; GFX7-NEXT: s_endpgm 14545; 14546; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14547; GFX10-WGP: ; %bb.0: ; %entry 14548; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14549; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14550; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14551; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14552; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14553; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14554; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14555; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14556; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14557; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14558; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14559; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14560; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14561; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14562; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14563; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14564; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14565; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14566; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14567; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14568; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 14569; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14570; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14571; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14572; GFX10-WGP-NEXT: buffer_gl0_inv 14573; GFX10-WGP-NEXT: s_endpgm 14574; 14575; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14576; GFX10-CU: ; %bb.0: ; %entry 14577; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14578; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14579; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14580; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14581; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14582; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14583; GFX10-CU-NEXT: s_mov_b32 s4, s8 14584; GFX10-CU-NEXT: s_mov_b32 s5, s9 14585; GFX10-CU-NEXT: s_mov_b32 s9, s10 14586; GFX10-CU-NEXT: s_mov_b32 s8, s11 14587; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14588; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14589; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14590; GFX10-CU-NEXT: s_mov_b32 s5, s8 14591; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14592; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14593; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14594; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14595; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14596; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14597; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14598; GFX10-CU-NEXT: s_endpgm 14599; 14600; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14601; SKIP-CACHE-INV: ; %bb.0: ; %entry 14602; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14603; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14604; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14605; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14606; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14607; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14608; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14609; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14610; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14611; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, 
s7 14612; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14613; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14614; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14615; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14616; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14617; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14618; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14619; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14620; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14621; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14622; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14623; SKIP-CACHE-INV-NEXT: s_endpgm 14624; 14625; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14626; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14627; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14628; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14629; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14630; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14631; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14632; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14633; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14634; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14635; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14636; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14637; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14638; 14639; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14640; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14641; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14642; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14643; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14644; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14645; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14646; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14647; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 14648; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14649; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14650; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14651; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14652; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14653; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 14654; GFX90A-TGSPLIT-NEXT: s_endpgm 14655; 14656; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14657; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14658; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14659; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14660; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14661; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14662; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14663; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14664; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14665; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14666; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14667; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14668; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14669; 14670; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14671; GFX940-TGSPLIT: ; %bb.0: ; %entry 14672; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14673; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14674; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14675; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14676; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14677; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14678; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14679; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14680; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14681; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14682; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14683; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) 14684; GFX940-TGSPLIT-NEXT: buffer_inv sc0 14685; GFX940-TGSPLIT-NEXT: s_endpgm 14686; 14687; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14688; GFX11-WGP: ; %bb.0: ; %entry 14689; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14690; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14691; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14692; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14693; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14694; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14695; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14696; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14697; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14698; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14699; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 14700; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14701; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14702; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14703; GFX11-WGP-NEXT: buffer_gl0_inv 14704; GFX11-WGP-NEXT: s_endpgm 14705; 14706; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14707; GFX11-CU: ; %bb.0: ; %entry 14708; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14709; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14710; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14711; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14712; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14713; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14714; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14715; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14716; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14717; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14718; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14719; GFX11-CU-NEXT: s_endpgm 14720; 14721; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14722; GFX12-WGP: ; %bb.0: ; %entry 14723; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14724; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14725; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14726; GFX12-WGP-NEXT: 
s_wait_kmcnt 0x0 14727; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14728; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14729; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14730; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14731; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14732; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14733; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 14734; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 14735; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 14736; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14737; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 14738; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14739; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 14740; GFX12-WGP-NEXT: s_endpgm 14741; 14742; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: 14743; GFX12-CU: ; %bb.0: ; %entry 14744; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14745; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14746; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14747; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14748; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14749; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14750; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14751; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14752; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14753; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14754; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14755; GFX12-CU-NEXT: s_endpgm 14756 ptr %out, i32 %in, i32 %old) { 14757entry: 14758 %gep = getelementptr i32, ptr %out, i32 4 14759 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire 14760 ret void 14761} 14762 14763define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( 14764; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14765; GFX7: ; %bb.0: ; %entry 14766; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14767; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14768; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14769; GFX7-NEXT: s_load_dword s6, 
s[4:5], 0x3 14770; GFX7-NEXT: s_mov_b64 s[10:11], 16 14771; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14772; GFX7-NEXT: s_mov_b32 s4, s8 14773; GFX7-NEXT: s_mov_b32 s5, s9 14774; GFX7-NEXT: s_mov_b32 s9, s10 14775; GFX7-NEXT: s_mov_b32 s8, s11 14776; GFX7-NEXT: s_add_u32 s4, s4, s9 14777; GFX7-NEXT: s_addc_u32 s8, s5, s8 14778; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14779; GFX7-NEXT: s_mov_b32 s5, s8 14780; GFX7-NEXT: v_mov_b32_e32 v2, s7 14781; GFX7-NEXT: v_mov_b32_e32 v0, s6 14782; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14783; GFX7-NEXT: v_mov_b32_e32 v3, v0 14784; GFX7-NEXT: v_mov_b32_e32 v0, s4 14785; GFX7-NEXT: v_mov_b32_e32 v1, s5 14786; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14787; GFX7-NEXT: s_endpgm 14788; 14789; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14790; GFX10-WGP: ; %bb.0: ; %entry 14791; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14792; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14793; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14794; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14795; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14796; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14797; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14798; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14799; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14800; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14801; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14802; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14803; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14804; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14805; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14806; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14807; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14808; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14809; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14810; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14811; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 14812; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14813; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] 14814; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14815; GFX10-WGP-NEXT: buffer_gl0_inv 14816; GFX10-WGP-NEXT: s_endpgm 14817; 14818; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14819; GFX10-CU: ; %bb.0: ; %entry 14820; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14821; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14822; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14823; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14824; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14825; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14826; GFX10-CU-NEXT: s_mov_b32 s4, s8 14827; GFX10-CU-NEXT: s_mov_b32 s5, s9 14828; GFX10-CU-NEXT: s_mov_b32 s9, s10 14829; GFX10-CU-NEXT: s_mov_b32 s8, s11 14830; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14831; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14832; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14833; GFX10-CU-NEXT: s_mov_b32 s5, s8 14834; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14835; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14836; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14837; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14838; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14839; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14840; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14841; GFX10-CU-NEXT: s_endpgm 14842; 14843; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14844; SKIP-CACHE-INV: ; %bb.0: ; %entry 14845; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14846; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14847; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14848; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14849; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14850; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14851; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14852; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14853; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14854; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14855; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14856; SKIP-CACHE-INV-NEXT: s_addc_u32 
s4, s1, s4 14857; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14858; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14859; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14860; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14861; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14862; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14863; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14864; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14865; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14866; SKIP-CACHE-INV-NEXT: s_endpgm 14867; 14868; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14869; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14870; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14871; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14872; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14873; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14874; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14875; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14876; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14877; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14878; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14879; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14880; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14881; 14882; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14883; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14884; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14885; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14886; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14887; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14888; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14889; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14890; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14891; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14892; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14893; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14894; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14895; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14896; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 14897; GFX90A-TGSPLIT-NEXT: s_endpgm 14898; 14899; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14900; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14901; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14902; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14903; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14904; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14905; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14906; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14907; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14908; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14909; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14910; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14911; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14912; 14913; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14914; GFX940-TGSPLIT: ; %bb.0: ; %entry 14915; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14916; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14917; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14918; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14919; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14920; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14921; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14922; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14923; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14924; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14925; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14926; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 14927; GFX940-TGSPLIT-NEXT: buffer_inv sc0 14928; 
GFX940-TGSPLIT-NEXT: s_endpgm 14929; 14930; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14931; GFX11-WGP: ; %bb.0: ; %entry 14932; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14933; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14934; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14935; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14936; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14937; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14938; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14939; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14940; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14941; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14942; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 14943; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14944; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14945; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 14946; GFX11-WGP-NEXT: buffer_gl0_inv 14947; GFX11-WGP-NEXT: s_endpgm 14948; 14949; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14950; GFX11-CU: ; %bb.0: ; %entry 14951; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14952; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14953; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14954; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14955; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14956; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14957; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14958; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14959; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14960; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14961; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14962; GFX11-CU-NEXT: s_endpgm 14963; 14964; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14965; GFX12-WGP: ; %bb.0: ; %entry 14966; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14967; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14968; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14969; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14970; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 
14971; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14972; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14973; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14974; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14975; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14976; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 14977; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 14978; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 14979; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14980; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 14981; GFX12-WGP-NEXT: s_wait_storecnt 0x0 14982; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 14983; GFX12-WGP-NEXT: s_endpgm 14984; 14985; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: 14986; GFX12-CU: ; %bb.0: ; %entry 14987; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14988; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14989; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14990; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14991; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14992; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14993; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14994; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14995; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14996; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14997; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14998; GFX12-CU-NEXT: s_endpgm 14999 ptr %out, i32 %in, i32 %old) { 15000entry: 15001 %gep = getelementptr i32, ptr %out, i32 4 15002 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst 15003 ret void 15004} 15005 15006define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( 15007; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15008; GFX7: ; %bb.0: ; %entry 15009; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 15010; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15011; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 15012; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 15013; GFX7-NEXT: s_mov_b64 s[10:11], 16 15014; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) 15015; GFX7-NEXT: s_mov_b32 s4, s8 15016; GFX7-NEXT: s_mov_b32 s5, s9 15017; GFX7-NEXT: s_mov_b32 s9, s10 15018; GFX7-NEXT: s_mov_b32 s8, s11 15019; GFX7-NEXT: s_add_u32 s4, s4, s9 15020; GFX7-NEXT: s_addc_u32 s8, s5, s8 15021; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15022; GFX7-NEXT: s_mov_b32 s5, s8 15023; GFX7-NEXT: v_mov_b32_e32 v2, s7 15024; GFX7-NEXT: v_mov_b32_e32 v0, s6 15025; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15026; GFX7-NEXT: v_mov_b32_e32 v3, v0 15027; GFX7-NEXT: v_mov_b32_e32 v0, s4 15028; GFX7-NEXT: v_mov_b32_e32 v1, s5 15029; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15030; GFX7-NEXT: s_endpgm 15031; 15032; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15033; GFX10-WGP: ; %bb.0: ; %entry 15034; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 15035; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15036; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 15037; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 15038; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 15039; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15040; GFX10-WGP-NEXT: s_mov_b32 s4, s8 15041; GFX10-WGP-NEXT: s_mov_b32 s5, s9 15042; GFX10-WGP-NEXT: s_mov_b32 s9, s10 15043; GFX10-WGP-NEXT: s_mov_b32 s8, s11 15044; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 15045; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 15046; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15047; GFX10-WGP-NEXT: s_mov_b32 s5, s8 15048; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 15049; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15050; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15051; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15052; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15053; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15054; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 15055; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15056; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15057; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 
15058; GFX10-WGP-NEXT: buffer_gl0_inv 15059; GFX10-WGP-NEXT: s_endpgm 15060; 15061; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15062; GFX10-CU: ; %bb.0: ; %entry 15063; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 15064; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15065; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 15066; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 15067; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 15068; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15069; GFX10-CU-NEXT: s_mov_b32 s4, s8 15070; GFX10-CU-NEXT: s_mov_b32 s5, s9 15071; GFX10-CU-NEXT: s_mov_b32 s9, s10 15072; GFX10-CU-NEXT: s_mov_b32 s8, s11 15073; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 15074; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 15075; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15076; GFX10-CU-NEXT: s_mov_b32 s5, s8 15077; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 15078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15079; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15080; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15081; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15082; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15083; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15084; GFX10-CU-NEXT: s_endpgm 15085; 15086; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15087; SKIP-CACHE-INV: ; %bb.0: ; %entry 15088; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 15089; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 15090; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 15091; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 15092; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 15093; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15094; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 15095; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 15096; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 15097; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 15098; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 15099; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 15100; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 
killed $sgpr0 def $sgpr0_sgpr1 15101; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 15102; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 15103; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15104; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15105; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15106; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15107; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15108; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15109; SKIP-CACHE-INV-NEXT: s_endpgm 15110; 15111; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15112; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15113; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15114; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15115; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15116; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15117; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15118; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15119; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15120; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15121; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15122; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15123; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15124; 15125; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15126; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15127; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15128; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15129; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15130; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15131; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15132; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15133; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15134; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15135; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15136; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15137; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15138; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15139; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 15140; GFX90A-TGSPLIT-NEXT: s_endpgm 15141; 15142; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15143; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15144; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15145; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15146; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15147; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15148; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15149; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15150; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15151; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15152; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15153; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15154; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15155; 15156; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15157; GFX940-TGSPLIT: ; %bb.0: ; %entry 15158; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15159; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15160; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15161; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15162; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15163; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15164; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15165; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15166; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15167; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15168; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15169; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15170; GFX940-TGSPLIT-NEXT: buffer_inv sc0 15171; GFX940-TGSPLIT-NEXT: s_endpgm 15172; 15173; GFX11-WGP-LABEL: 
flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15174; GFX11-WGP: ; %bb.0: ; %entry 15175; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15176; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15177; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15178; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15179; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15180; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15181; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15182; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15183; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15184; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15185; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 15186; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15187; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15188; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15189; GFX11-WGP-NEXT: buffer_gl0_inv 15190; GFX11-WGP-NEXT: s_endpgm 15191; 15192; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15193; GFX11-CU: ; %bb.0: ; %entry 15194; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15195; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15196; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15197; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15198; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15199; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15200; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15201; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15202; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15203; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15204; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15205; GFX11-CU-NEXT: s_endpgm 15206; 15207; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15208; GFX12-WGP: ; %bb.0: ; %entry 15209; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15210; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15211; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15212; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15213; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15214; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15215; GFX12-WGP-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15216; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 15217; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15218; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15219; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 15220; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 15221; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 15222; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15223; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 15224; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15225; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 15226; GFX12-WGP-NEXT: s_endpgm 15227; 15228; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: 15229; GFX12-CU: ; %bb.0: ; %entry 15230; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15231; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15232; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15233; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15234; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15235; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15236; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15237; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15238; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15239; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15240; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15241; GFX12-CU-NEXT: s_endpgm 15242 ptr %out, i32 %in, i32 %old) { 15243entry: 15244 %gep = getelementptr i32, ptr %out, i32 4 15245 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst 15246 ret void 15247} 15248 15249define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( 15250; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15251; GFX7: ; %bb.0: ; %entry 15252; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 15253; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15254; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 15255; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 15256; GFX7-NEXT: s_mov_b64 s[10:11], 16 15257; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15258; GFX7-NEXT: s_mov_b32 s4, s8 15259; 
GFX7-NEXT: s_mov_b32 s5, s9 15260; GFX7-NEXT: s_mov_b32 s9, s10 15261; GFX7-NEXT: s_mov_b32 s8, s11 15262; GFX7-NEXT: s_add_u32 s4, s4, s9 15263; GFX7-NEXT: s_addc_u32 s8, s5, s8 15264; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15265; GFX7-NEXT: s_mov_b32 s5, s8 15266; GFX7-NEXT: v_mov_b32_e32 v2, s7 15267; GFX7-NEXT: v_mov_b32_e32 v0, s6 15268; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15269; GFX7-NEXT: v_mov_b32_e32 v3, v0 15270; GFX7-NEXT: v_mov_b32_e32 v0, s4 15271; GFX7-NEXT: v_mov_b32_e32 v1, s5 15272; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15273; GFX7-NEXT: s_endpgm 15274; 15275; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15276; GFX10-WGP: ; %bb.0: ; %entry 15277; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 15278; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15279; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 15280; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 15281; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 15282; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15283; GFX10-WGP-NEXT: s_mov_b32 s4, s8 15284; GFX10-WGP-NEXT: s_mov_b32 s5, s9 15285; GFX10-WGP-NEXT: s_mov_b32 s9, s10 15286; GFX10-WGP-NEXT: s_mov_b32 s8, s11 15287; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 15288; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 15289; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15290; GFX10-WGP-NEXT: s_mov_b32 s5, s8 15291; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 15292; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15293; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15294; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15295; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15296; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15297; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 15298; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15299; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15300; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15301; GFX10-WGP-NEXT: buffer_gl0_inv 15302; GFX10-WGP-NEXT: s_endpgm 15303; 
15304; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15305; GFX10-CU: ; %bb.0: ; %entry 15306; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 15307; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15308; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 15309; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 15310; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 15311; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15312; GFX10-CU-NEXT: s_mov_b32 s4, s8 15313; GFX10-CU-NEXT: s_mov_b32 s5, s9 15314; GFX10-CU-NEXT: s_mov_b32 s9, s10 15315; GFX10-CU-NEXT: s_mov_b32 s8, s11 15316; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 15317; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 15318; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15319; GFX10-CU-NEXT: s_mov_b32 s5, s8 15320; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 15321; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15322; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15323; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15324; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15325; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15326; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15327; GFX10-CU-NEXT: s_endpgm 15328; 15329; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15330; SKIP-CACHE-INV: ; %bb.0: ; %entry 15331; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 15332; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 15333; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 15334; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 15335; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 15336; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15337; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 15338; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 15339; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 15340; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 15341; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 15342; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 15343; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 15344; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 
15345; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 15346; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15347; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15348; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15349; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15350; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15351; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15352; SKIP-CACHE-INV-NEXT: s_endpgm 15353; 15354; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15355; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15356; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15357; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15358; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15359; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15360; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15361; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15362; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15363; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15364; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15365; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15366; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15367; 15368; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15369; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15370; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15371; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15372; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15373; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15374; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15375; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15376; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15377; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15378; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15379; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15380; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15381; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15382; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 15383; GFX90A-TGSPLIT-NEXT: s_endpgm 15384; 15385; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15386; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15387; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15388; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15389; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15390; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15391; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15392; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15393; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15394; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15395; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15396; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15397; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15398; 15399; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15400; GFX940-TGSPLIT: ; %bb.0: ; %entry 15401; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15402; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15403; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15404; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15405; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15406; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15407; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15408; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15409; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15410; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15411; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15412; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15413; GFX940-TGSPLIT-NEXT: buffer_inv sc0 15414; GFX940-TGSPLIT-NEXT: s_endpgm 15415; 15416; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15417; GFX11-WGP: ; %bb.0: ; %entry 15418; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 15419; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15420; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15421; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15422; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15423; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15424; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15425; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15426; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15427; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15428; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 15429; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15430; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15431; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15432; GFX11-WGP-NEXT: buffer_gl0_inv 15433; GFX11-WGP-NEXT: s_endpgm 15434; 15435; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15436; GFX11-CU: ; %bb.0: ; %entry 15437; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15438; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15439; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15440; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15441; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15442; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15443; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15444; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15445; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15446; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15447; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15448; GFX11-CU-NEXT: s_endpgm 15449; 15450; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15451; GFX12-WGP: ; %bb.0: ; %entry 15452; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15453; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15454; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15455; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15456; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15457; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15458; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15459; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 
15460; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15461; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15462; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 15463; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 15464; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 15465; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15466; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 15467; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15468; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 15469; GFX12-WGP-NEXT: s_endpgm 15470; 15471; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 15472; GFX12-CU: ; %bb.0: ; %entry 15473; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15474; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15475; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15476; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15477; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15478; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15479; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15480; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15481; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15482; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15483; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15484; GFX12-CU-NEXT: s_endpgm 15485 ptr %out, i32 %in, i32 %old) { 15486entry: 15487 %gep = getelementptr i32, ptr %out, i32 4 15488 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst 15489 ret void 15490} 15491 15492define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( 15493; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15494; GFX7: ; %bb.0: ; %entry 15495; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 15496; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15497; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 15498; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 15499; GFX7-NEXT: s_mov_b64 s[10:11], 16 15500; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15501; GFX7-NEXT: s_mov_b32 s4, s8 15502; GFX7-NEXT: s_mov_b32 s5, s9 15503; GFX7-NEXT: s_mov_b32 s9, s10 15504; GFX7-NEXT: s_mov_b32 s8, s11 15505; 
GFX7-NEXT: s_add_u32 s4, s4, s9 15506; GFX7-NEXT: s_addc_u32 s8, s5, s8 15507; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15508; GFX7-NEXT: s_mov_b32 s5, s8 15509; GFX7-NEXT: v_mov_b32_e32 v2, s7 15510; GFX7-NEXT: v_mov_b32_e32 v0, s6 15511; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15512; GFX7-NEXT: v_mov_b32_e32 v3, v0 15513; GFX7-NEXT: v_mov_b32_e32 v0, s4 15514; GFX7-NEXT: v_mov_b32_e32 v1, s5 15515; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15516; GFX7-NEXT: s_endpgm 15517; 15518; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15519; GFX10-WGP: ; %bb.0: ; %entry 15520; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 15521; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15522; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 15523; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 15524; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 15525; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15526; GFX10-WGP-NEXT: s_mov_b32 s4, s8 15527; GFX10-WGP-NEXT: s_mov_b32 s5, s9 15528; GFX10-WGP-NEXT: s_mov_b32 s9, s10 15529; GFX10-WGP-NEXT: s_mov_b32 s8, s11 15530; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 15531; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 15532; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15533; GFX10-WGP-NEXT: s_mov_b32 s5, s8 15534; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 15535; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15536; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15537; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15538; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15539; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15540; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 15541; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15542; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15543; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15544; GFX10-WGP-NEXT: buffer_gl0_inv 15545; GFX10-WGP-NEXT: s_endpgm 15546; 15547; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15548; GFX10-CU: ; %bb.0: ; %entry 
15549; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 15550; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15551; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 15552; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 15553; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 15554; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15555; GFX10-CU-NEXT: s_mov_b32 s4, s8 15556; GFX10-CU-NEXT: s_mov_b32 s5, s9 15557; GFX10-CU-NEXT: s_mov_b32 s9, s10 15558; GFX10-CU-NEXT: s_mov_b32 s8, s11 15559; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 15560; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 15561; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15562; GFX10-CU-NEXT: s_mov_b32 s5, s8 15563; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 15564; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15565; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15566; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15567; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15568; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15569; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15570; GFX10-CU-NEXT: s_endpgm 15571; 15572; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15573; SKIP-CACHE-INV: ; %bb.0: ; %entry 15574; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 15575; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 15576; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 15577; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 15578; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 15579; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15580; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 15581; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 15582; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 15583; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 15584; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 15585; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 15586; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 15587; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 15588; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 15589; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15590; 
SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15591; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15592; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15593; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15594; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15595; SKIP-CACHE-INV-NEXT: s_endpgm 15596; 15597; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15598; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15599; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15600; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15601; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15602; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15603; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15604; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15605; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15606; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15607; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15608; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15609; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15610; 15611; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15612; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15613; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15614; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15615; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15616; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15617; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15618; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15619; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15620; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15621; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15622; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15623; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15624; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15625; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 15626; GFX90A-TGSPLIT-NEXT: s_endpgm 15627; 15628; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15629; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15630; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15631; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15632; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15633; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15634; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15635; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15636; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15637; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15638; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15639; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15640; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15641; 15642; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15643; GFX940-TGSPLIT: ; %bb.0: ; %entry 15644; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15645; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15646; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15647; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15648; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15649; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15650; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15651; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15652; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15653; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15654; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15655; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15656; GFX940-TGSPLIT-NEXT: buffer_inv sc0 15657; GFX940-TGSPLIT-NEXT: s_endpgm 15658; 15659; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15660; GFX11-WGP: ; %bb.0: ; %entry 15661; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15662; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15663; 
GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15664; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15665; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15666; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15667; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15668; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15669; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15670; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15671; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 15672; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15673; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15674; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15675; GFX11-WGP-NEXT: buffer_gl0_inv 15676; GFX11-WGP-NEXT: s_endpgm 15677; 15678; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15679; GFX11-CU: ; %bb.0: ; %entry 15680; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15681; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15682; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15683; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15684; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15685; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15686; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15687; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15688; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15689; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15690; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15691; GFX11-CU-NEXT: s_endpgm 15692; 15693; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15694; GFX12-WGP: ; %bb.0: ; %entry 15695; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15696; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15697; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15698; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15699; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15700; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15701; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15702; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 15703; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15704; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 
15705; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 15706; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 15707; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 15708; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15709; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 15710; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15711; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 15712; GFX12-WGP-NEXT: s_endpgm 15713; 15714; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: 15715; GFX12-CU: ; %bb.0: ; %entry 15716; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15717; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15718; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15719; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15720; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15721; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15722; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15723; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15724; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15725; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15726; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15727; GFX12-CU-NEXT: s_endpgm 15728 ptr %out, i32 %in, i32 %old) { 15729entry: 15730 %gep = getelementptr i32, ptr %out, i32 4 15731 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst 15732 ret void 15733} 15734 15735define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( 15736; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15737; GFX7: ; %bb.0: ; %entry 15738; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 15739; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15740; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 15741; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 15742; GFX7-NEXT: s_mov_b64 s[10:11], 16 15743; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15744; GFX7-NEXT: s_mov_b32 s4, s8 15745; GFX7-NEXT: s_mov_b32 s5, s9 15746; GFX7-NEXT: s_mov_b32 s9, s10 15747; GFX7-NEXT: s_mov_b32 s8, s11 15748; GFX7-NEXT: s_add_u32 s4, s4, s9 15749; GFX7-NEXT: s_addc_u32 s8, s5, s8 15750; GFX7-NEXT: 
; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15751; GFX7-NEXT: s_mov_b32 s5, s8 15752; GFX7-NEXT: v_mov_b32_e32 v2, s7 15753; GFX7-NEXT: v_mov_b32_e32 v0, s6 15754; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15755; GFX7-NEXT: v_mov_b32_e32 v3, v0 15756; GFX7-NEXT: v_mov_b32_e32 v0, s4 15757; GFX7-NEXT: v_mov_b32_e32 v1, s5 15758; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15759; GFX7-NEXT: s_endpgm 15760; 15761; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15762; GFX10-WGP: ; %bb.0: ; %entry 15763; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 15764; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15765; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 15766; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 15767; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 15768; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15769; GFX10-WGP-NEXT: s_mov_b32 s4, s8 15770; GFX10-WGP-NEXT: s_mov_b32 s5, s9 15771; GFX10-WGP-NEXT: s_mov_b32 s9, s10 15772; GFX10-WGP-NEXT: s_mov_b32 s8, s11 15773; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 15774; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 15775; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15776; GFX10-WGP-NEXT: s_mov_b32 s5, s8 15777; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 15778; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15779; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15780; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15781; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15782; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15783; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 15784; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15785; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15786; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15787; GFX10-WGP-NEXT: buffer_gl0_inv 15788; GFX10-WGP-NEXT: s_endpgm 15789; 15790; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15791; GFX10-CU: ; %bb.0: ; %entry 15792; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 15793; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 15794; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 15795; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 15796; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 15797; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15798; GFX10-CU-NEXT: s_mov_b32 s4, s8 15799; GFX10-CU-NEXT: s_mov_b32 s5, s9 15800; GFX10-CU-NEXT: s_mov_b32 s9, s10 15801; GFX10-CU-NEXT: s_mov_b32 s8, s11 15802; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 15803; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 15804; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15805; GFX10-CU-NEXT: s_mov_b32 s5, s8 15806; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 15807; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15808; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15809; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15810; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15811; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15812; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15813; GFX10-CU-NEXT: s_endpgm 15814; 15815; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15816; SKIP-CACHE-INV: ; %bb.0: ; %entry 15817; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 15818; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 15819; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 15820; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 15821; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 15822; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15823; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 15824; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 15825; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 15826; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 15827; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 15828; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 15829; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 15830; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 15831; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 15832; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15833; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15834; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15835; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15836; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15837; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15838; SKIP-CACHE-INV-NEXT: s_endpgm 15839; 15840; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15841; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15842; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15843; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15844; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15845; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15846; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15847; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15848; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15849; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15850; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15851; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15852; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15853; 15854; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15855; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15856; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15857; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15858; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15859; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15860; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15861; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15862; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15863; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15864; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15865; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15866; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15867; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15868; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 15869; GFX90A-TGSPLIT-NEXT: s_endpgm 15870; 15871; 
GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15872; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15873; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15874; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15875; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15876; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15877; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15878; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15879; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15880; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15881; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15882; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15883; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15884; 15885; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15886; GFX940-TGSPLIT: ; %bb.0: ; %entry 15887; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15888; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15889; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15890; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15891; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15892; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15893; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15894; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15895; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15896; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15897; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15898; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15899; GFX940-TGSPLIT-NEXT: buffer_inv sc0 15900; GFX940-TGSPLIT-NEXT: s_endpgm 15901; 15902; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15903; GFX11-WGP: ; %bb.0: ; %entry 15904; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15905; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15906; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15907; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15908; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15909; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15910; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15911; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15912; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15913; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15914; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 15915; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15916; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15917; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 15918; GFX11-WGP-NEXT: buffer_gl0_inv 15919; GFX11-WGP-NEXT: s_endpgm 15920; 15921; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15922; GFX11-CU: ; %bb.0: ; %entry 15923; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15924; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15925; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15926; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15927; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15928; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15929; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15930; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15931; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15932; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15933; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15934; GFX11-CU-NEXT: s_endpgm 15935; 15936; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15937; GFX12-WGP: ; %bb.0: ; %entry 15938; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15939; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15940; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15941; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15942; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15943; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15944; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15945; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 15946; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15947; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15948; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 15949; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 15950; 
GFX12-WGP-NEXT: s_wait_loadcnt 0x0 15951; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15952; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE 15953; GFX12-WGP-NEXT: s_wait_storecnt 0x0 15954; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 15955; GFX12-WGP-NEXT: s_endpgm 15956; 15957; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 15958; GFX12-CU: ; %bb.0: ; %entry 15959; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15960; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15961; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15962; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15963; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15964; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15965; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15966; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15967; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15968; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15969; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15970; GFX12-CU-NEXT: s_endpgm 15971 ptr %out, i32 %in, i32 %old) { 15972entry: 15973 %gep = getelementptr i32, ptr %out, i32 4 15974 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst 15975 ret void 15976} 15977 15978define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( 15979; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 15980; GFX7: ; %bb.0: ; %entry 15981; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 15982; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15983; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 15984; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 15985; GFX7-NEXT: s_mov_b64 s[12:13], 16 15986; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15987; GFX7-NEXT: s_mov_b32 s6, s4 15988; GFX7-NEXT: s_mov_b32 s7, s5 15989; GFX7-NEXT: s_mov_b32 s11, s12 15990; GFX7-NEXT: s_mov_b32 s10, s13 15991; GFX7-NEXT: s_add_u32 s6, s6, s11 15992; GFX7-NEXT: s_addc_u32 s10, s7, s10 15993; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15994; GFX7-NEXT: 
s_mov_b32 s7, s10 15995; GFX7-NEXT: v_mov_b32_e32 v2, s9 15996; GFX7-NEXT: v_mov_b32_e32 v0, s8 15997; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15998; GFX7-NEXT: v_mov_b32_e32 v3, v0 15999; GFX7-NEXT: v_mov_b32_e32 v0, s6 16000; GFX7-NEXT: v_mov_b32_e32 v1, s7 16001; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16002; GFX7-NEXT: v_mov_b32_e32 v0, s4 16003; GFX7-NEXT: v_mov_b32_e32 v1, s5 16004; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16005; GFX7-NEXT: flat_store_dword v[0:1], v2 16006; GFX7-NEXT: s_endpgm 16007; 16008; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16009; GFX10-WGP: ; %bb.0: ; %entry 16010; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16011; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16012; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16013; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16014; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16015; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16016; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16017; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16018; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16019; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16020; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16021; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16022; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16023; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16024; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16025; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16026; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16027; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16028; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16029; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16030; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16031; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16032; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16033; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16034; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16035; GFX10-WGP-NEXT: s_endpgm 16036; 16037; GFX10-CU-LABEL: 
flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16038; GFX10-CU: ; %bb.0: ; %entry 16039; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16040; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16041; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16042; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16043; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16044; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16045; GFX10-CU-NEXT: s_mov_b32 s6, s4 16046; GFX10-CU-NEXT: s_mov_b32 s7, s5 16047; GFX10-CU-NEXT: s_mov_b32 s11, s12 16048; GFX10-CU-NEXT: s_mov_b32 s10, s13 16049; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16050; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16051; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16052; GFX10-CU-NEXT: s_mov_b32 s7, s10 16053; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16054; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16055; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16056; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16057; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16058; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16059; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16060; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16061; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16062; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16063; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16064; GFX10-CU-NEXT: s_endpgm 16065; 16066; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16067; SKIP-CACHE-INV: ; %bb.0: ; %entry 16068; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16069; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16070; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16071; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16072; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16073; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16074; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16075; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16076; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16077; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16078; SKIP-CACHE-INV-NEXT: 
s_add_u32 s2, s2, s7 16079; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16080; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16081; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16082; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 16083; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16084; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16085; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16086; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 16087; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16088; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16091; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16092; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16093; SKIP-CACHE-INV-NEXT: s_endpgm 16094; 16095; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16096; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16097; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16098; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16099; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16100; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16101; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16102; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16103; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16104; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16105; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16106; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16107; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16108; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16109; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 16110; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16111; 16112; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16113; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry 16114; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16115; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16116; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16117; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16118; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16119; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16120; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16121; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16122; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16123; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16124; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16125; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16126; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16127; GFX90A-TGSPLIT-NEXT: s_endpgm 16128; 16129; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16130; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16131; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16132; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16133; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16134; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16135; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16136; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16137; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16138; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16139; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16140; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16141; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16142; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16143; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16144; GFX940-NOTTGSPLIT-NEXT: s_endpgm 16145; 16146; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16147; GFX940-TGSPLIT: ; %bb.0: ; %entry 16148; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16149; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16150; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16151; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16152; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16153; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16154; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16155; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16156; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16157; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16158; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16159; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16160; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16161; GFX940-TGSPLIT-NEXT: s_endpgm 16162; 16163; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16164; GFX11-WGP: ; %bb.0: ; %entry 16165; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16166; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16167; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16168; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16169; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16170; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16171; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16172; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16173; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16174; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16175; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16176; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16177; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16178; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16179; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16180; GFX11-WGP-NEXT: s_endpgm 16181; 16182; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16183; GFX11-CU: ; %bb.0: ; %entry 16184; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16185; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16186; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
16187; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16188; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16189; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16190; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16191; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 16192; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16193; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16194; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16195; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16196; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16197; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16198; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16199; GFX11-CU-NEXT: s_endpgm 16200; 16201; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16202; GFX12-WGP: ; %bb.0: ; %entry 16203; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16204; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16205; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16206; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16207; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16208; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16209; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16210; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16211; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16212; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16213; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 16214; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16215; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16216; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 16217; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16218; GFX12-WGP-NEXT: s_endpgm 16219; 16220; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: 16221; GFX12-CU: ; %bb.0: ; %entry 16222; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16223; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16224; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16225; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16226; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16227; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16228; 
GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16229; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 16230; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16231; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16232; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16233; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16234; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16235; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16236; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16237; GFX12-CU-NEXT: s_endpgm 16238 ptr %out, i32 %in, i32 %old) { 16239entry: 16240 %gep = getelementptr i32, ptr %out, i32 4 16241 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic 16242 %val0 = extractvalue { i32, i1 } %val, 0 16243 store i32 %val0, ptr %out, align 4 16244 ret void 16245} 16246 16247define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( 16248; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16249; GFX7: ; %bb.0: ; %entry 16250; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16251; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16252; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16253; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16254; GFX7-NEXT: s_mov_b64 s[12:13], 16 16255; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16256; GFX7-NEXT: s_mov_b32 s6, s4 16257; GFX7-NEXT: s_mov_b32 s7, s5 16258; GFX7-NEXT: s_mov_b32 s11, s12 16259; GFX7-NEXT: s_mov_b32 s10, s13 16260; GFX7-NEXT: s_add_u32 s6, s6, s11 16261; GFX7-NEXT: s_addc_u32 s10, s7, s10 16262; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16263; GFX7-NEXT: s_mov_b32 s7, s10 16264; GFX7-NEXT: v_mov_b32_e32 v2, s9 16265; GFX7-NEXT: v_mov_b32_e32 v0, s8 16266; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16267; GFX7-NEXT: v_mov_b32_e32 v3, v0 16268; GFX7-NEXT: v_mov_b32_e32 v0, s6 16269; GFX7-NEXT: v_mov_b32_e32 v1, s7 16270; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16271; GFX7-NEXT: v_mov_b32_e32 v0, s4 
16272; GFX7-NEXT: v_mov_b32_e32 v1, s5 16273; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16274; GFX7-NEXT: flat_store_dword v[0:1], v2 16275; GFX7-NEXT: s_endpgm 16276; 16277; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16278; GFX10-WGP: ; %bb.0: ; %entry 16279; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16280; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16281; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16282; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16283; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16284; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16285; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16286; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16287; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16288; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16289; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16290; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16291; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16292; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16293; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16294; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16295; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16296; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16297; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16298; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16299; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16300; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 16301; GFX10-WGP-NEXT: buffer_gl0_inv 16302; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16303; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16304; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16305; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16306; GFX10-WGP-NEXT: s_endpgm 16307; 16308; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16309; GFX10-CU: ; %bb.0: ; %entry 16310; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16311; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16312; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16313; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16314; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16315; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16316; GFX10-CU-NEXT: s_mov_b32 s6, s4 16317; GFX10-CU-NEXT: s_mov_b32 s7, s5 16318; GFX10-CU-NEXT: s_mov_b32 s11, s12 16319; GFX10-CU-NEXT: s_mov_b32 s10, s13 16320; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16321; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16322; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16323; GFX10-CU-NEXT: s_mov_b32 s7, s10 16324; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16325; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16326; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16327; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16328; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16329; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16330; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16331; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16332; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16333; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16334; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16335; GFX10-CU-NEXT: s_endpgm 16336; 16337; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16338; SKIP-CACHE-INV: ; %bb.0: ; %entry 16339; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16340; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16341; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16342; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16343; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16344; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16345; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16346; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16347; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16348; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16349; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 16350; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16351; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16352; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16353; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 16354; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16355; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 16356; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16357; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 16358; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16359; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16360; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16361; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16362; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16363; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16364; SKIP-CACHE-INV-NEXT: s_endpgm 16365; 16366; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16367; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16368; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16369; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16370; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16371; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16372; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16373; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16374; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16375; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16376; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16377; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16378; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16379; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16380; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 16381; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16382; 16383; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16384; GFX90A-TGSPLIT: ; %bb.0: ; %entry 16385; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16386; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16387; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16388; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16389; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16390; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16391; 
GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16392; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16393; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16394; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16395; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16396; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 16397; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16398; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16399; GFX90A-TGSPLIT-NEXT: s_endpgm 16400; 16401; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16402; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16403; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16404; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16405; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16406; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16407; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16408; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16409; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16410; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16411; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16412; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16413; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16414; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16415; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16416; GFX940-NOTTGSPLIT-NEXT: s_endpgm 16417; 16418; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16419; GFX940-TGSPLIT: ; %bb.0: ; %entry 16420; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16421; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16422; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16423; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16424; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16425; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 
v0, s2 16426; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16427; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16428; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16429; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16430; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16431; GFX940-TGSPLIT-NEXT: buffer_inv sc0 16432; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16433; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16434; GFX940-TGSPLIT-NEXT: s_endpgm 16435; 16436; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16437; GFX11-WGP: ; %bb.0: ; %entry 16438; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16439; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16440; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16441; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16442; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16443; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16444; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16445; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16446; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16447; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16448; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16449; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 16450; GFX11-WGP-NEXT: buffer_gl0_inv 16451; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16452; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16453; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16454; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16455; GFX11-WGP-NEXT: s_endpgm 16456; 16457; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16458; GFX11-CU: ; %bb.0: ; %entry 16459; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16460; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16461; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16462; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16463; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16464; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16465; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec 16466; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 16467; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16468; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16469; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16470; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16471; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16472; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16473; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16474; GFX11-CU-NEXT: s_endpgm 16475; 16476; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16477; GFX12-WGP: ; %bb.0: ; %entry 16478; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16479; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16480; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16481; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16482; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16483; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16484; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16485; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16486; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16487; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16488; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 16489; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 16490; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 16491; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16492; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16493; GFX12-WGP-NEXT: s_wait_dscnt 0x0 16494; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16495; GFX12-WGP-NEXT: s_endpgm 16496; 16497; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 16498; GFX12-CU: ; %bb.0: ; %entry 16499; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16500; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16501; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16502; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16503; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16504; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16505; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16506; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 16507; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16508; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16509; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16510; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16511; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16512; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16513; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16514; GFX12-CU-NEXT: s_endpgm 16515 ptr %out, i32 %in, i32 %old) { 16516entry: 16517 %gep = getelementptr i32, ptr %out, i32 4 16518 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic 16519 %val0 = extractvalue { i32, i1 } %val, 0 16520 store i32 %val0, ptr %out, align 4 16521 ret void 16522} 16523 16524define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( 16525; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16526; GFX7: ; %bb.0: ; %entry 16527; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16528; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16529; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16530; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16531; GFX7-NEXT: s_mov_b64 s[12:13], 16 16532; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16533; GFX7-NEXT: s_mov_b32 s6, s4 16534; GFX7-NEXT: s_mov_b32 s7, s5 16535; GFX7-NEXT: s_mov_b32 s11, s12 16536; GFX7-NEXT: s_mov_b32 s10, s13 16537; GFX7-NEXT: s_add_u32 s6, s6, s11 16538; GFX7-NEXT: s_addc_u32 s10, s7, s10 16539; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16540; GFX7-NEXT: s_mov_b32 s7, s10 16541; GFX7-NEXT: v_mov_b32_e32 v2, s9 16542; GFX7-NEXT: v_mov_b32_e32 v0, s8 16543; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16544; GFX7-NEXT: v_mov_b32_e32 v3, v0 16545; GFX7-NEXT: v_mov_b32_e32 v0, s6 16546; GFX7-NEXT: v_mov_b32_e32 v1, s7 16547; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16548; GFX7-NEXT: v_mov_b32_e32 v0, s4 16549; GFX7-NEXT: v_mov_b32_e32 v1, s5 16550; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16551; GFX7-NEXT: flat_store_dword v[0:1], 
v2 16552; GFX7-NEXT: s_endpgm 16553; 16554; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16555; GFX10-WGP: ; %bb.0: ; %entry 16556; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16557; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16558; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16559; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16560; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16561; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16562; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16563; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16564; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16565; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16566; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16567; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16568; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16569; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16570; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16571; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16572; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16573; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16574; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16575; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16576; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 16577; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 16578; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16579; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16580; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16581; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16582; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16583; GFX10-WGP-NEXT: s_endpgm 16584; 16585; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16586; GFX10-CU: ; %bb.0: ; %entry 16587; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16588; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16589; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16590; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16591; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16592; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16593; GFX10-CU-NEXT: s_mov_b32 s6, s4 16594; GFX10-CU-NEXT: s_mov_b32 s7, s5 
16595; GFX10-CU-NEXT: s_mov_b32 s11, s12 16596; GFX10-CU-NEXT: s_mov_b32 s10, s13 16597; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16598; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16599; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16600; GFX10-CU-NEXT: s_mov_b32 s7, s10 16601; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16602; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16603; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16604; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16605; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16606; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16607; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16608; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16609; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16610; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16611; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16612; GFX10-CU-NEXT: s_endpgm 16613; 16614; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16615; SKIP-CACHE-INV: ; %bb.0: ; %entry 16616; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16617; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16618; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16619; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16620; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16621; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16622; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16623; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16624; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16625; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16626; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 16627; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16628; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16629; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16630; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 16631; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16632; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16633; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16634; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 16635; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16636; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16637; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16638; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16639; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16640; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16641; SKIP-CACHE-INV-NEXT: s_endpgm 16642; 16643; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16644; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16645; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16646; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16647; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16648; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16649; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16650; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16651; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16652; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16653; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16654; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16655; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16656; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16657; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 16658; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16659; 16660; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16661; GFX90A-TGSPLIT: ; %bb.0: ; %entry 16662; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16663; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16664; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16665; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16666; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16667; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16668; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16669; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 16670; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16671; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16672; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16673; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16674; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16675; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16676; GFX90A-TGSPLIT-NEXT: s_endpgm 16677; 16678; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16679; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16680; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16681; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16682; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16683; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16684; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16685; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16686; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16687; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16688; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16689; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16690; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16691; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16692; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16693; GFX940-NOTTGSPLIT-NEXT: s_endpgm 16694; 16695; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16696; GFX940-TGSPLIT: ; %bb.0: ; %entry 16697; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16698; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16699; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16700; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16701; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16702; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16703; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16704; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16705; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16706; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16707; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16708; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16709; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16710; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16711; GFX940-TGSPLIT-NEXT: s_endpgm 16712; 16713; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16714; GFX11-WGP: ; %bb.0: ; %entry 16715; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16716; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16717; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16718; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16719; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16720; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16721; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16722; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16723; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16724; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16725; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 16726; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 16727; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16728; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16729; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16730; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16731; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16732; GFX11-WGP-NEXT: s_endpgm 16733; 16734; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16735; GFX11-CU: ; %bb.0: ; %entry 16736; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16737; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16738; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16739; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16740; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16741; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16742; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16743; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 16744; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 16745; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16746; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16747; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16748; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16749; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16750; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16751; GFX11-CU-NEXT: s_endpgm 16752; 16753; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16754; GFX12-WGP: ; %bb.0: ; %entry 16755; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16756; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16757; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16758; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16759; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16760; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16761; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16762; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16763; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16764; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16765; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 16766; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 16767; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 16768; GFX12-WGP-NEXT: s_wait_storecnt 0x0 16769; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 16770; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16771; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16772; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 16773; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16774; GFX12-WGP-NEXT: s_endpgm 16775; 16776; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: 16777; GFX12-CU: ; %bb.0: ; %entry 16778; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16779; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16780; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16781; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16782; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16783; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16784; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16785; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 
16786; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16787; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16788; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16789; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16790; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16791; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16792; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16793; GFX12-CU-NEXT: s_endpgm 16794 ptr %out, i32 %in, i32 %old) { 16795entry: 16796 %gep = getelementptr i32, ptr %out, i32 4 16797 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic 16798 %val0 = extractvalue { i32, i1 } %val, 0 16799 store i32 %val0, ptr %out, align 4 16800 ret void 16801} 16802 16803define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( 16804; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16805; GFX7: ; %bb.0: ; %entry 16806; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16807; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16808; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16809; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16810; GFX7-NEXT: s_mov_b64 s[12:13], 16 16811; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16812; GFX7-NEXT: s_mov_b32 s6, s4 16813; GFX7-NEXT: s_mov_b32 s7, s5 16814; GFX7-NEXT: s_mov_b32 s11, s12 16815; GFX7-NEXT: s_mov_b32 s10, s13 16816; GFX7-NEXT: s_add_u32 s6, s6, s11 16817; GFX7-NEXT: s_addc_u32 s10, s7, s10 16818; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16819; GFX7-NEXT: s_mov_b32 s7, s10 16820; GFX7-NEXT: v_mov_b32_e32 v2, s9 16821; GFX7-NEXT: v_mov_b32_e32 v0, s8 16822; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16823; GFX7-NEXT: v_mov_b32_e32 v3, v0 16824; GFX7-NEXT: v_mov_b32_e32 v0, s6 16825; GFX7-NEXT: v_mov_b32_e32 v1, s7 16826; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16827; GFX7-NEXT: v_mov_b32_e32 v0, s4 16828; GFX7-NEXT: v_mov_b32_e32 v1, s5 16829; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16830; GFX7-NEXT: flat_store_dword 
v[0:1], v2 16831; GFX7-NEXT: s_endpgm 16832; 16833; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16834; GFX10-WGP: ; %bb.0: ; %entry 16835; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16836; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16837; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16838; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16839; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16840; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16841; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16842; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16843; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16844; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16845; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16846; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16847; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16848; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16849; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16850; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16851; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16852; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16853; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16854; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16855; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 16856; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 16857; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16858; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 16859; GFX10-WGP-NEXT: buffer_gl0_inv 16860; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16861; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16862; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16863; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16864; GFX10-WGP-NEXT: s_endpgm 16865; 16866; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16867; GFX10-CU: ; %bb.0: ; %entry 16868; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16869; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16870; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16871; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16872; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16873; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 
16874; GFX10-CU-NEXT: s_mov_b32 s6, s4 16875; GFX10-CU-NEXT: s_mov_b32 s7, s5 16876; GFX10-CU-NEXT: s_mov_b32 s11, s12 16877; GFX10-CU-NEXT: s_mov_b32 s10, s13 16878; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16879; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16880; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16881; GFX10-CU-NEXT: s_mov_b32 s7, s10 16882; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16883; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16884; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16885; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16886; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16887; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16888; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16889; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16890; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16891; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16892; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16893; GFX10-CU-NEXT: s_endpgm 16894; 16895; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16896; SKIP-CACHE-INV: ; %bb.0: ; %entry 16897; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16898; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16899; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16900; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16901; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16902; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16903; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16904; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16905; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16906; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16907; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 16908; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16909; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16910; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16911; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 16912; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16913; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
16914; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16915; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 16916; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16917; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16919; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16920; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16921; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16922; SKIP-CACHE-INV-NEXT: s_endpgm 16923; 16924; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16925; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16926; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16927; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16928; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16929; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16930; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16931; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16932; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16933; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16934; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16935; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16936; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16937; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16938; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 16939; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16940; 16941; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16942; GFX90A-TGSPLIT: ; %bb.0: ; %entry 16943; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16944; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16945; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16946; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16947; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16948; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16949; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16950; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16951; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16952; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16953; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16954; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16955; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 16956; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16957; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16958; GFX90A-TGSPLIT-NEXT: s_endpgm 16959; 16960; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16961; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16962; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16963; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16964; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16965; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16966; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16967; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16968; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16969; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16970; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16971; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16972; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16973; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16974; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16975; GFX940-NOTTGSPLIT-NEXT: s_endpgm 16976; 16977; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16978; GFX940-TGSPLIT: ; %bb.0: ; %entry 16979; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16980; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16981; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16982; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16983; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16984; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s2 16985; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16986; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16987; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16988; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16989; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16990; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16991; GFX940-TGSPLIT-NEXT: buffer_inv sc0 16992; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16993; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16994; GFX940-TGSPLIT-NEXT: s_endpgm 16995; 16996; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 16997; GFX11-WGP: ; %bb.0: ; %entry 16998; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16999; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17000; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17001; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17002; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17003; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17004; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17005; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17006; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17007; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17008; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 17009; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 17010; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17011; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 17012; GFX11-WGP-NEXT: buffer_gl0_inv 17013; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17014; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17015; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17016; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17017; GFX11-WGP-NEXT: s_endpgm 17018; 17019; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 17020; GFX11-CU: ; %bb.0: ; %entry 17021; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17022; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17023; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17024; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
17025; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17026; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17027; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17028; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17029; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17030; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17031; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17032; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17033; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17034; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17035; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17036; GFX11-CU-NEXT: s_endpgm 17037; 17038; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 17039; GFX12-WGP: ; %bb.0: ; %entry 17040; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17041; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17042; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17043; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17044; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17045; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 17046; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17047; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17048; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17049; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17050; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 17051; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 17052; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 17053; GFX12-WGP-NEXT: s_wait_storecnt 0x0 17054; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 17055; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 17056; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 17057; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 17058; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 17059; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17060; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17061; GFX12-WGP-NEXT: s_wait_dscnt 0x0 17062; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17063; GFX12-WGP-NEXT: s_endpgm 17064; 17065; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: 17066; GFX12-CU: ; %bb.0: ; %entry 
17067; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17068; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17069; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17070; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17071; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 17072; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 17073; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17074; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 17075; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17076; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17077; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17078; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17079; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17080; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 17081; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 17082; GFX12-CU-NEXT: s_endpgm 17083 ptr %out, i32 %in, i32 %old) { 17084entry: 17085 %gep = getelementptr i32, ptr %out, i32 4 17086 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic 17087 %val0 = extractvalue { i32, i1 } %val, 0 17088 store i32 %val0, ptr %out, align 4 17089 ret void 17090} 17091 17092define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( 17093; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17094; GFX7: ; %bb.0: ; %entry 17095; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 17096; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17097; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 17098; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 17099; GFX7-NEXT: s_mov_b64 s[12:13], 16 17100; GFX7-NEXT: s_waitcnt lgkmcnt(0) 17101; GFX7-NEXT: s_mov_b32 s6, s4 17102; GFX7-NEXT: s_mov_b32 s7, s5 17103; GFX7-NEXT: s_mov_b32 s11, s12 17104; GFX7-NEXT: s_mov_b32 s10, s13 17105; GFX7-NEXT: s_add_u32 s6, s6, s11 17106; GFX7-NEXT: s_addc_u32 s10, s7, s10 17107; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17108; GFX7-NEXT: s_mov_b32 s7, s10 17109; GFX7-NEXT: v_mov_b32_e32 v2, s9 17110; GFX7-NEXT: v_mov_b32_e32 v0, s8 17111; GFX7-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17112; GFX7-NEXT: v_mov_b32_e32 v3, v0 17113; GFX7-NEXT: v_mov_b32_e32 v0, s6 17114; GFX7-NEXT: v_mov_b32_e32 v1, s7 17115; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17116; GFX7-NEXT: v_mov_b32_e32 v0, s4 17117; GFX7-NEXT: v_mov_b32_e32 v1, s5 17118; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17119; GFX7-NEXT: flat_store_dword v[0:1], v2 17120; GFX7-NEXT: s_endpgm 17121; 17122; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17123; GFX10-WGP: ; %bb.0: ; %entry 17124; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 17125; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17126; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 17127; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 17128; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 17129; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17130; GFX10-WGP-NEXT: s_mov_b32 s6, s4 17131; GFX10-WGP-NEXT: s_mov_b32 s7, s5 17132; GFX10-WGP-NEXT: s_mov_b32 s11, s12 17133; GFX10-WGP-NEXT: s_mov_b32 s10, s13 17134; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 17135; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 17136; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17137; GFX10-WGP-NEXT: s_mov_b32 s7, s10 17138; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 17139; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 17140; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17141; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 17142; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 17143; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 17144; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 17145; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 17146; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17147; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 17148; GFX10-WGP-NEXT: buffer_gl0_inv 17149; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 17150; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 17151; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17152; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 17153; GFX10-WGP-NEXT: s_endpgm 17154; 
17155; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17156; GFX10-CU: ; %bb.0: ; %entry 17157; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 17158; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17159; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 17160; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 17161; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 17162; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 17163; GFX10-CU-NEXT: s_mov_b32 s6, s4 17164; GFX10-CU-NEXT: s_mov_b32 s7, s5 17165; GFX10-CU-NEXT: s_mov_b32 s11, s12 17166; GFX10-CU-NEXT: s_mov_b32 s10, s13 17167; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 17168; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 17169; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17170; GFX10-CU-NEXT: s_mov_b32 s7, s10 17171; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 17172; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 17173; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17174; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 17175; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 17176; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 17177; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17178; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 17179; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 17180; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17181; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 17182; GFX10-CU-NEXT: s_endpgm 17183; 17184; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17185; SKIP-CACHE-INV: ; %bb.0: ; %entry 17186; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 17187; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 17188; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 17189; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 17190; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 17191; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 17192; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 17193; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 17194; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 17195; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 17196; 
SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 17197; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 17198; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 17199; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 17200; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 17201; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 17202; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17203; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 17204; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 17205; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 17206; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17207; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 17208; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 17209; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17210; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 17211; SKIP-CACHE-INV-NEXT: s_endpgm 17212; 17213; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17214; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 17215; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17216; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17217; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17218; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17219; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17220; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17221; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17222; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17223; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17224; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17225; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17226; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17227; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 17228; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 17229; 17230; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17231; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry 17232; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17233; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17234; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17235; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17236; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17237; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17238; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17239; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17240; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17241; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17242; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17243; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17244; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 17245; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17246; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 17247; GFX90A-TGSPLIT-NEXT: s_endpgm 17248; 17249; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17250; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 17251; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17252; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17253; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17254; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17255; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17256; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17257; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17258; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17259; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17260; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17261; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17262; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17263; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17264; GFX940-NOTTGSPLIT-NEXT: s_endpgm 17265; 17266; GFX940-TGSPLIT-LABEL: 
flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17267; GFX940-TGSPLIT: ; %bb.0: ; %entry 17268; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17269; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17270; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17271; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17272; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17273; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17274; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17275; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17276; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17277; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17278; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17279; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17280; GFX940-TGSPLIT-NEXT: buffer_inv sc0 17281; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17282; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17283; GFX940-TGSPLIT-NEXT: s_endpgm 17284; 17285; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17286; GFX11-WGP: ; %bb.0: ; %entry 17287; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17288; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17289; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17290; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17291; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17292; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17293; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17294; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17295; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17296; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17297; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 17298; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 17299; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17300; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 17301; GFX11-WGP-NEXT: buffer_gl0_inv 17302; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17303; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17304; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 
17305; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17306; GFX11-WGP-NEXT: s_endpgm 17307; 17308; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17309; GFX11-CU: ; %bb.0: ; %entry 17310; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17311; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17312; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17313; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 17314; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17315; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17316; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17317; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17318; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17319; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17320; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17321; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17322; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17323; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17324; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17325; GFX11-CU-NEXT: s_endpgm 17326; 17327; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17328; GFX12-WGP: ; %bb.0: ; %entry 17329; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17330; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17331; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17332; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17333; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17334; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 17335; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17336; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17337; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17338; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17339; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 17340; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 17341; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 17342; GFX12-WGP-NEXT: s_wait_storecnt 0x0 17343; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 17344; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 17345; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 17346; GFX12-WGP-NEXT: 
s_wait_loadcnt 0x0 17347; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 17348; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17349; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17350; GFX12-WGP-NEXT: s_wait_dscnt 0x0 17351; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17352; GFX12-WGP-NEXT: s_endpgm 17353; 17354; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 17355; GFX12-CU: ; %bb.0: ; %entry 17356; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17357; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17358; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17359; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17360; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 17361; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 17362; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17363; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 17364; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17365; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17366; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17367; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17368; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17369; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 17370; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 17371; GFX12-CU-NEXT: s_endpgm 17372 ptr %out, i32 %in, i32 %old) { 17373entry: 17374 %gep = getelementptr i32, ptr %out, i32 4 17375 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic 17376 %val0 = extractvalue { i32, i1 } %val, 0 17377 store i32 %val0, ptr %out, align 4 17378 ret void 17379} 17380 17381define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( 17382; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17383; GFX7: ; %bb.0: ; %entry 17384; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 17385; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17386; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 17387; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 17388; GFX7-NEXT: s_mov_b64 s[12:13], 16 17389; GFX7-NEXT: s_waitcnt lgkmcnt(0) 17390; GFX7-NEXT: s_mov_b32 
s6, s4 17391; GFX7-NEXT: s_mov_b32 s7, s5 17392; GFX7-NEXT: s_mov_b32 s11, s12 17393; GFX7-NEXT: s_mov_b32 s10, s13 17394; GFX7-NEXT: s_add_u32 s6, s6, s11 17395; GFX7-NEXT: s_addc_u32 s10, s7, s10 17396; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17397; GFX7-NEXT: s_mov_b32 s7, s10 17398; GFX7-NEXT: v_mov_b32_e32 v2, s9 17399; GFX7-NEXT: v_mov_b32_e32 v0, s8 17400; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17401; GFX7-NEXT: v_mov_b32_e32 v3, v0 17402; GFX7-NEXT: v_mov_b32_e32 v0, s6 17403; GFX7-NEXT: v_mov_b32_e32 v1, s7 17404; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17405; GFX7-NEXT: v_mov_b32_e32 v0, s4 17406; GFX7-NEXT: v_mov_b32_e32 v1, s5 17407; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17408; GFX7-NEXT: flat_store_dword v[0:1], v2 17409; GFX7-NEXT: s_endpgm 17410; 17411; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17412; GFX10-WGP: ; %bb.0: ; %entry 17413; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 17414; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17415; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 17416; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 17417; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 17418; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17419; GFX10-WGP-NEXT: s_mov_b32 s6, s4 17420; GFX10-WGP-NEXT: s_mov_b32 s7, s5 17421; GFX10-WGP-NEXT: s_mov_b32 s11, s12 17422; GFX10-WGP-NEXT: s_mov_b32 s10, s13 17423; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 17424; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 17425; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17426; GFX10-WGP-NEXT: s_mov_b32 s7, s10 17427; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 17428; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 17429; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17430; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 17431; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 17432; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 17433; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 
17434; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 17435; GFX10-WGP-NEXT: buffer_gl0_inv 17436; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 17437; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 17438; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17439; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 17440; GFX10-WGP-NEXT: s_endpgm 17441; 17442; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17443; GFX10-CU: ; %bb.0: ; %entry 17444; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 17445; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17446; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 17447; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 17448; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 17449; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 17450; GFX10-CU-NEXT: s_mov_b32 s6, s4 17451; GFX10-CU-NEXT: s_mov_b32 s7, s5 17452; GFX10-CU-NEXT: s_mov_b32 s11, s12 17453; GFX10-CU-NEXT: s_mov_b32 s10, s13 17454; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 17455; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 17456; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17457; GFX10-CU-NEXT: s_mov_b32 s7, s10 17458; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 17459; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 17460; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17461; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 17462; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 17463; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 17464; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17465; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 17466; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 17467; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17468; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 17469; GFX10-CU-NEXT: s_endpgm 17470; 17471; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17472; SKIP-CACHE-INV: ; %bb.0: ; %entry 17473; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 17474; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 17475; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 17476; SKIP-CACHE-INV-NEXT: s_load_dword 
s4, s[2:3], 0x3 17477; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 17478; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 17479; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 17480; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 17481; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 17482; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 17483; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 17484; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 17485; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 17486; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 17487; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 17488; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 17489; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 17491; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 17492; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 17493; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17494; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 17495; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 17496; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17497; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 17498; SKIP-CACHE-INV-NEXT: s_endpgm 17499; 17500; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17501; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 17502; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17503; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17504; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17505; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17506; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17507; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17508; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17509; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17510; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17511; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17512; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17513; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17514; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 17515; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 17516; 17517; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17518; GFX90A-TGSPLIT: ; %bb.0: ; %entry 17519; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17520; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17521; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17522; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17523; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17524; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17525; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17526; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17527; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17528; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17529; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17530; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 17531; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17532; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 17533; GFX90A-TGSPLIT-NEXT: s_endpgm 17534; 17535; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17536; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 17537; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17538; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17539; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17540; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17541; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17542; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17543; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17544; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17545; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17546; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17547; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17548; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17549; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17550; GFX940-NOTTGSPLIT-NEXT: s_endpgm 17551; 17552; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17553; GFX940-TGSPLIT: ; %bb.0: ; %entry 17554; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17555; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17556; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17557; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17558; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17559; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17560; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17561; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17562; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17563; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17564; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17565; GFX940-TGSPLIT-NEXT: buffer_inv sc0 17566; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17567; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17568; GFX940-TGSPLIT-NEXT: s_endpgm 17569; 17570; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17571; GFX11-WGP: ; %bb.0: ; %entry 17572; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17573; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17574; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17575; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17576; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17577; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17578; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17579; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17580; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17581; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17582; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17583; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 17584; GFX11-WGP-NEXT: buffer_gl0_inv 17585; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17586; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17587; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17588; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17589; GFX11-WGP-NEXT: s_endpgm 17590; 17591; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17592; GFX11-CU: ; %bb.0: ; %entry 17593; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17594; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17595; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17596; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 17597; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17598; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17599; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17600; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17601; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17602; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17603; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17604; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17605; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17606; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17607; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17608; GFX11-CU-NEXT: s_endpgm 17609; 17610; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17611; GFX12-WGP: ; %bb.0: ; %entry 17612; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17613; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17614; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17615; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17616; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17617; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 17618; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17619; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17620; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17621; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17622; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 17623; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 17624; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 17625; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 17626; GFX12-WGP-NEXT: 
global_inv scope:SCOPE_SE 17627; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17628; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17629; GFX12-WGP-NEXT: s_wait_dscnt 0x0 17630; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17631; GFX12-WGP-NEXT: s_endpgm 17632; 17633; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: 17634; GFX12-CU: ; %bb.0: ; %entry 17635; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17636; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17637; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17638; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17639; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 17640; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 17641; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17642; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 17643; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17644; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17645; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17646; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17647; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17648; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 17649; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 17650; GFX12-CU-NEXT: s_endpgm 17651 ptr %out, i32 %in, i32 %old) { 17652entry: 17653 %gep = getelementptr i32, ptr %out, i32 4 17654 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire 17655 %val0 = extractvalue { i32, i1 } %val, 0 17656 store i32 %val0, ptr %out, align 4 17657 ret void 17658} 17659 17660define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( 17661; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17662; GFX7: ; %bb.0: ; %entry 17663; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 17664; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17665; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 17666; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 17667; GFX7-NEXT: s_mov_b64 s[12:13], 16 17668; GFX7-NEXT: s_waitcnt lgkmcnt(0) 17669; GFX7-NEXT: s_mov_b32 s6, s4 17670; GFX7-NEXT: s_mov_b32 s7, s5 17671; 
GFX7-NEXT: s_mov_b32 s11, s12 17672; GFX7-NEXT: s_mov_b32 s10, s13 17673; GFX7-NEXT: s_add_u32 s6, s6, s11 17674; GFX7-NEXT: s_addc_u32 s10, s7, s10 17675; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17676; GFX7-NEXT: s_mov_b32 s7, s10 17677; GFX7-NEXT: v_mov_b32_e32 v2, s9 17678; GFX7-NEXT: v_mov_b32_e32 v0, s8 17679; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17680; GFX7-NEXT: v_mov_b32_e32 v3, v0 17681; GFX7-NEXT: v_mov_b32_e32 v0, s6 17682; GFX7-NEXT: v_mov_b32_e32 v1, s7 17683; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17684; GFX7-NEXT: v_mov_b32_e32 v0, s4 17685; GFX7-NEXT: v_mov_b32_e32 v1, s5 17686; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17687; GFX7-NEXT: flat_store_dword v[0:1], v2 17688; GFX7-NEXT: s_endpgm 17689; 17690; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17691; GFX10-WGP: ; %bb.0: ; %entry 17692; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 17693; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17694; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 17695; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 17696; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 17697; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17698; GFX10-WGP-NEXT: s_mov_b32 s6, s4 17699; GFX10-WGP-NEXT: s_mov_b32 s7, s5 17700; GFX10-WGP-NEXT: s_mov_b32 s11, s12 17701; GFX10-WGP-NEXT: s_mov_b32 s10, s13 17702; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 17703; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 17704; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17705; GFX10-WGP-NEXT: s_mov_b32 s7, s10 17706; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 17707; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 17708; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17709; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 17710; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 17711; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 17712; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17713; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 17714; 
GFX10-WGP-NEXT: buffer_gl0_inv 17715; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 17716; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 17717; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17718; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 17719; GFX10-WGP-NEXT: s_endpgm 17720; 17721; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17722; GFX10-CU: ; %bb.0: ; %entry 17723; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 17724; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17725; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 17726; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 17727; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 17728; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 17729; GFX10-CU-NEXT: s_mov_b32 s6, s4 17730; GFX10-CU-NEXT: s_mov_b32 s7, s5 17731; GFX10-CU-NEXT: s_mov_b32 s11, s12 17732; GFX10-CU-NEXT: s_mov_b32 s10, s13 17733; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 17734; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 17735; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17736; GFX10-CU-NEXT: s_mov_b32 s7, s10 17737; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 17738; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 17739; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17740; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 17741; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 17742; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 17743; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17744; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 17745; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 17746; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17747; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 17748; GFX10-CU-NEXT: s_endpgm 17749; 17750; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17751; SKIP-CACHE-INV: ; %bb.0: ; %entry 17752; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 17753; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 17754; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 17755; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 17756; SKIP-CACHE-INV-NEXT: s_mov_b64 
s[8:9], 16 17757; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 17758; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 17759; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 17760; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 17761; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 17762; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 17763; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 17764; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 17765; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 17766; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 17767; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 17768; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17769; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 17770; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 17771; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 17772; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17773; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 17774; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 17775; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17776; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 17777; SKIP-CACHE-INV-NEXT: s_endpgm 17778; 17779; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17780; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 17781; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17782; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17783; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17784; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17785; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17786; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17787; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17788; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17789; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17790; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17791; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17792; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17793; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 17794; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 17795; 17796; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17797; GFX90A-TGSPLIT: ; %bb.0: ; %entry 17798; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17799; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17800; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17801; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17802; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17803; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17804; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17805; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17806; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17807; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17808; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17809; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 17810; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17811; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 17812; GFX90A-TGSPLIT-NEXT: s_endpgm 17813; 17814; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17815; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 17816; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17817; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17818; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17819; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17820; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17821; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17822; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17823; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17824; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17825; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17826; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17827; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17828; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17829; GFX940-NOTTGSPLIT-NEXT: s_endpgm 17830; 17831; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17832; GFX940-TGSPLIT: ; %bb.0: ; %entry 17833; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17834; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17835; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17836; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17837; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17838; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17839; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17840; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17841; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17842; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17843; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17844; GFX940-TGSPLIT-NEXT: buffer_inv sc0 17845; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17846; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17847; GFX940-TGSPLIT-NEXT: s_endpgm 17848; 17849; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17850; GFX11-WGP: ; %bb.0: ; %entry 17851; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17852; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17853; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17854; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17855; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17856; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17857; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17858; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17859; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17860; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17861; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17862; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 17863; GFX11-WGP-NEXT: buffer_gl0_inv 17864; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17865; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 17866; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17867; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17868; GFX11-WGP-NEXT: s_endpgm 17869; 17870; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17871; GFX11-CU: ; %bb.0: ; %entry 17872; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17873; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17874; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17875; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 17876; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17877; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17878; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17879; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17880; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17881; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17882; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17883; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17884; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17885; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17886; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17887; GFX11-CU-NEXT: s_endpgm 17888; 17889; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17890; GFX12-WGP: ; %bb.0: ; %entry 17891; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17892; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17893; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17894; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17895; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17896; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 17897; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17898; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17899; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17900; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17901; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 17902; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 17903; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 17904; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17905; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17906; GFX12-WGP-NEXT: s_wait_dscnt 0x0 
17907; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17908; GFX12-WGP-NEXT: s_endpgm 17909; 17910; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: 17911; GFX12-CU: ; %bb.0: ; %entry 17912; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17913; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17914; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17915; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17916; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 17917; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 17918; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17919; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 17920; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17921; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17922; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17923; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17924; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17925; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 17926; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 17927; GFX12-CU-NEXT: s_endpgm 17928 ptr %out, i32 %in, i32 %old) { 17929entry: 17930 %gep = getelementptr i32, ptr %out, i32 4 17931 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire 17932 %val0 = extractvalue { i32, i1 } %val, 0 17933 store i32 %val0, ptr %out, align 4 17934 ret void 17935} 17936 17937define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( 17938; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 17939; GFX7: ; %bb.0: ; %entry 17940; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 17941; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17942; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 17943; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 17944; GFX7-NEXT: s_mov_b64 s[12:13], 16 17945; GFX7-NEXT: s_waitcnt lgkmcnt(0) 17946; GFX7-NEXT: s_mov_b32 s6, s4 17947; GFX7-NEXT: s_mov_b32 s7, s5 17948; GFX7-NEXT: s_mov_b32 s11, s12 17949; GFX7-NEXT: s_mov_b32 s10, s13 17950; GFX7-NEXT: s_add_u32 s6, s6, s11 17951; GFX7-NEXT: s_addc_u32 s10, s7, s10 17952; 
GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17953; GFX7-NEXT: s_mov_b32 s7, s10 17954; GFX7-NEXT: v_mov_b32_e32 v2, s9 17955; GFX7-NEXT: v_mov_b32_e32 v0, s8 17956; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17957; GFX7-NEXT: v_mov_b32_e32 v3, v0 17958; GFX7-NEXT: v_mov_b32_e32 v0, s6 17959; GFX7-NEXT: v_mov_b32_e32 v1, s7 17960; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17961; GFX7-NEXT: v_mov_b32_e32 v0, s4 17962; GFX7-NEXT: v_mov_b32_e32 v1, s5 17963; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17964; GFX7-NEXT: flat_store_dword v[0:1], v2 17965; GFX7-NEXT: s_endpgm 17966; 17967; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 17968; GFX10-WGP: ; %bb.0: ; %entry 17969; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 17970; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17971; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 17972; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 17973; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 17974; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17975; GFX10-WGP-NEXT: s_mov_b32 s6, s4 17976; GFX10-WGP-NEXT: s_mov_b32 s7, s5 17977; GFX10-WGP-NEXT: s_mov_b32 s11, s12 17978; GFX10-WGP-NEXT: s_mov_b32 s10, s13 17979; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 17980; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 17981; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17982; GFX10-WGP-NEXT: s_mov_b32 s7, s10 17983; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 17984; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 17985; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17986; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 17987; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 17988; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 17989; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 17990; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 17991; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17992; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 17993; GFX10-WGP-NEXT: buffer_gl0_inv 17994; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s4 17995; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 17996; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17997; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 17998; GFX10-WGP-NEXT: s_endpgm 17999; 18000; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18001; GFX10-CU: ; %bb.0: ; %entry 18002; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 18003; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18004; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 18005; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 18006; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 18007; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 18008; GFX10-CU-NEXT: s_mov_b32 s6, s4 18009; GFX10-CU-NEXT: s_mov_b32 s7, s5 18010; GFX10-CU-NEXT: s_mov_b32 s11, s12 18011; GFX10-CU-NEXT: s_mov_b32 s10, s13 18012; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 18013; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 18014; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18015; GFX10-CU-NEXT: s_mov_b32 s7, s10 18016; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 18017; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 18018; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18019; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 18020; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 18021; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 18022; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18023; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 18024; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 18025; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18026; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 18027; GFX10-CU-NEXT: s_endpgm 18028; 18029; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18030; SKIP-CACHE-INV: ; %bb.0: ; %entry 18031; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 18032; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 18033; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 18034; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 18035; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 18036; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 18037; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 18038; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 18039; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 18040; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 18041; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 18042; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 18043; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 18044; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 18045; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 18046; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 18047; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18048; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 18049; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 18050; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 18051; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18052; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 18053; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 18054; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18055; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 18056; SKIP-CACHE-INV-NEXT: s_endpgm 18057; 18058; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18059; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 18060; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18061; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18062; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18063; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18064; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18065; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18066; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18067; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18068; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18069; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18070; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18071; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
18072; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 18073; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 18074; 18075; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18076; GFX90A-TGSPLIT: ; %bb.0: ; %entry 18077; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18078; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18079; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18080; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18081; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18082; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18083; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18084; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18085; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18086; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18087; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18088; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18089; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 18090; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18091; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 18092; GFX90A-TGSPLIT-NEXT: s_endpgm 18093; 18094; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18095; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 18096; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18097; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18098; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18099; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18100; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18101; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18102; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18103; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18104; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18105; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18106; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18107; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18108; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18109; GFX940-NOTTGSPLIT-NEXT: s_endpgm 18110; 18111; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18112; GFX940-TGSPLIT: ; %bb.0: ; %entry 18113; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18114; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18115; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18116; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18117; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18118; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18119; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18120; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18121; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18122; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18123; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18124; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18125; GFX940-TGSPLIT-NEXT: buffer_inv sc0 18126; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18127; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18128; GFX940-TGSPLIT-NEXT: s_endpgm 18129; 18130; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18131; GFX11-WGP: ; %bb.0: ; %entry 18132; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18133; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18134; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18135; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18136; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 18137; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 18138; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18139; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 18140; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18141; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18142; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 18143; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 18144; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18145; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 18146; GFX11-WGP-NEXT: buffer_gl0_inv 18147; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18148; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18149; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18150; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 18151; GFX11-WGP-NEXT: s_endpgm 18152; 18153; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18154; GFX11-CU: ; %bb.0: ; %entry 18155; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18156; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18157; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18158; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 18159; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 18160; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 18161; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18162; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 18163; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18164; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18165; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18166; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18167; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18168; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18169; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 18170; GFX11-CU-NEXT: s_endpgm 18171; 18172; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18173; GFX12-WGP: ; %bb.0: ; %entry 18174; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18175; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18176; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18177; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 18178; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 18179; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 18180; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18181; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 18182; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18183; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18184; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 18185; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 18186; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 18187; GFX12-WGP-NEXT: s_wait_storecnt 0x0 18188; 
GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 18189; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 18190; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 18191; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 18192; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 18193; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18194; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18195; GFX12-WGP-NEXT: s_wait_dscnt 0x0 18196; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 18197; GFX12-WGP-NEXT: s_endpgm 18198; 18199; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: 18200; GFX12-CU: ; %bb.0: ; %entry 18201; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18202; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18203; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18204; GFX12-CU-NEXT: s_wait_kmcnt 0x0 18205; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 18206; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 18207; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18208; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 18209; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18210; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18211; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 18212; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18213; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18214; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 18215; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 18216; GFX12-CU-NEXT: s_endpgm 18217 ptr %out, i32 %in, i32 %old) { 18218entry: 18219 %gep = getelementptr i32, ptr %out, i32 4 18220 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire 18221 %val0 = extractvalue { i32, i1 } %val, 0 18222 store i32 %val0, ptr %out, align 4 18223 ret void 18224} 18225 18226define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( 18227; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18228; GFX7: ; %bb.0: ; %entry 18229; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 18230; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 
18231; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 18232; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 18233; GFX7-NEXT: s_mov_b64 s[12:13], 16 18234; GFX7-NEXT: s_waitcnt lgkmcnt(0) 18235; GFX7-NEXT: s_mov_b32 s6, s4 18236; GFX7-NEXT: s_mov_b32 s7, s5 18237; GFX7-NEXT: s_mov_b32 s11, s12 18238; GFX7-NEXT: s_mov_b32 s10, s13 18239; GFX7-NEXT: s_add_u32 s6, s6, s11 18240; GFX7-NEXT: s_addc_u32 s10, s7, s10 18241; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18242; GFX7-NEXT: s_mov_b32 s7, s10 18243; GFX7-NEXT: v_mov_b32_e32 v2, s9 18244; GFX7-NEXT: v_mov_b32_e32 v0, s8 18245; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18246; GFX7-NEXT: v_mov_b32_e32 v3, v0 18247; GFX7-NEXT: v_mov_b32_e32 v0, s6 18248; GFX7-NEXT: v_mov_b32_e32 v1, s7 18249; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18250; GFX7-NEXT: v_mov_b32_e32 v0, s4 18251; GFX7-NEXT: v_mov_b32_e32 v1, s5 18252; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18253; GFX7-NEXT: flat_store_dword v[0:1], v2 18254; GFX7-NEXT: s_endpgm 18255; 18256; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18257; GFX10-WGP: ; %bb.0: ; %entry 18258; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 18259; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18260; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 18261; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 18262; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 18263; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 18264; GFX10-WGP-NEXT: s_mov_b32 s6, s4 18265; GFX10-WGP-NEXT: s_mov_b32 s7, s5 18266; GFX10-WGP-NEXT: s_mov_b32 s11, s12 18267; GFX10-WGP-NEXT: s_mov_b32 s10, s13 18268; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 18269; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 18270; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18271; GFX10-WGP-NEXT: s_mov_b32 s7, s10 18272; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 18273; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 18274; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec 18275; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 18276; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 18277; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 18278; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 18279; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 18280; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18281; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 18282; GFX10-WGP-NEXT: buffer_gl0_inv 18283; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 18284; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 18285; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 18286; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 18287; GFX10-WGP-NEXT: s_endpgm 18288; 18289; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18290; GFX10-CU: ; %bb.0: ; %entry 18291; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 18292; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18293; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 18294; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 18295; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 18296; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 18297; GFX10-CU-NEXT: s_mov_b32 s6, s4 18298; GFX10-CU-NEXT: s_mov_b32 s7, s5 18299; GFX10-CU-NEXT: s_mov_b32 s11, s12 18300; GFX10-CU-NEXT: s_mov_b32 s10, s13 18301; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 18302; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 18303; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18304; GFX10-CU-NEXT: s_mov_b32 s7, s10 18305; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 18306; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 18307; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18308; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 18309; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 18310; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 18311; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18312; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 18313; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 18314; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18315; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 18316; GFX10-CU-NEXT: s_endpgm 18317; 18318; SKIP-CACHE-INV-LABEL: 
flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18319; SKIP-CACHE-INV: ; %bb.0: ; %entry 18320; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 18321; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 18322; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 18323; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 18324; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 18325; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 18326; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 18327; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 18328; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 18329; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 18330; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 18331; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 18332; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 18333; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 18334; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 18335; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 18336; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18337; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 18338; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 18339; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 18340; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18341; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 18342; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 18343; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18344; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 18345; SKIP-CACHE-INV-NEXT: s_endpgm 18346; 18347; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18348; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 18349; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18350; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18351; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18352; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18353; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18354; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18355; GFX90A-NOTTGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18356; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18357; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18358; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18359; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18360; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18361; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 18362; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 18363; 18364; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18365; GFX90A-TGSPLIT: ; %bb.0: ; %entry 18366; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18367; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18368; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18369; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18370; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18371; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18372; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18373; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18374; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18375; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18376; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18377; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18378; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 18379; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18380; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 18381; GFX90A-TGSPLIT-NEXT: s_endpgm 18382; 18383; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18384; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 18385; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18386; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18387; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18388; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18389; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18390; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18391; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18392; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18393; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18394; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18395; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18396; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18397; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18398; GFX940-NOTTGSPLIT-NEXT: s_endpgm 18399; 18400; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18401; GFX940-TGSPLIT: ; %bb.0: ; %entry 18402; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18403; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18404; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18405; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18406; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18407; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18408; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18409; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18410; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18411; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18412; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18413; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18414; GFX940-TGSPLIT-NEXT: buffer_inv sc0 18415; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18416; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18417; GFX940-TGSPLIT-NEXT: s_endpgm 18418; 18419; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18420; GFX11-WGP: ; %bb.0: ; %entry 18421; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18422; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18423; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18424; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18425; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 18426; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 18427; 
GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18428; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 18429; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18430; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18431; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 18432; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 18433; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18434; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 18435; GFX11-WGP-NEXT: buffer_gl0_inv 18436; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18437; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18438; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18439; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 18440; GFX11-WGP-NEXT: s_endpgm 18441; 18442; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18443; GFX11-CU: ; %bb.0: ; %entry 18444; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18445; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18446; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18447; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 18448; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 18449; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 18450; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18451; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 18452; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18453; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18454; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18455; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18456; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18457; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18458; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 18459; GFX11-CU-NEXT: s_endpgm 18460; 18461; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18462; GFX12-WGP: ; %bb.0: ; %entry 18463; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18464; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18465; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18466; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 18467; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 18468; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 18469; 
GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18470; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 18471; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18472; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18473; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 18474; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 18475; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 18476; GFX12-WGP-NEXT: s_wait_storecnt 0x0 18477; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 18478; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 18479; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 18480; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 18481; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 18482; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18483; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18484; GFX12-WGP-NEXT: s_wait_dscnt 0x0 18485; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 18486; GFX12-WGP-NEXT: s_endpgm 18487; 18488; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: 18489; GFX12-CU: ; %bb.0: ; %entry 18490; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18491; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18492; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18493; GFX12-CU-NEXT: s_wait_kmcnt 0x0 18494; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 18495; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 18496; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18497; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 18498; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18499; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18500; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 18501; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18502; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18503; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 18504; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 18505; GFX12-CU-NEXT: s_endpgm 18506 ptr %out, i32 %in, i32 %old) { 18507entry: 18508 %gep = getelementptr i32, ptr %out, i32 4 18509 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire 18510 
%val0 = extractvalue { i32, i1 } %val, 0 18511 store i32 %val0, ptr %out, align 4 18512 ret void 18513} 18514

; Returning cmpxchg through a flat pointer: compares the i32 located 4 elements
; past %out (byte offset 16) with %old and swaps in %in, using
; syncscope("workgroup-one-as") with success ordering seq_cst and failure
; ordering acquire. Field 0 of the result (the loaded value) is then stored
; to %out. The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Returning cmpxchg through a flat pointer: compares the i32 located 4 elements
; past %out (byte offset 16) with %old and swaps in %in, using
; syncscope("workgroup-one-as") with success ordering monotonic and failure
; ordering seq_cst. Field 0 of the result (the loaded value) is then stored
; to %out. The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}
19092 19093define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( 19094; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19095; GFX7: ; %bb.0: ; %entry 19096; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 19097; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19098; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 19099; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 19100; GFX7-NEXT: s_mov_b64 s[12:13], 16 19101; GFX7-NEXT: s_waitcnt lgkmcnt(0) 19102; GFX7-NEXT: s_mov_b32 s6, s4 19103; GFX7-NEXT: s_mov_b32 s7, s5 19104; GFX7-NEXT: s_mov_b32 s11, s12 19105; GFX7-NEXT: s_mov_b32 s10, s13 19106; GFX7-NEXT: s_add_u32 s6, s6, s11 19107; GFX7-NEXT: s_addc_u32 s10, s7, s10 19108; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19109; GFX7-NEXT: s_mov_b32 s7, s10 19110; GFX7-NEXT: v_mov_b32_e32 v2, s9 19111; GFX7-NEXT: v_mov_b32_e32 v0, s8 19112; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19113; GFX7-NEXT: v_mov_b32_e32 v3, v0 19114; GFX7-NEXT: v_mov_b32_e32 v0, s6 19115; GFX7-NEXT: v_mov_b32_e32 v1, s7 19116; GFX7-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19117; GFX7-NEXT: v_mov_b32_e32 v0, s4 19118; GFX7-NEXT: v_mov_b32_e32 v1, s5 19119; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19120; GFX7-NEXT: flat_store_dword v[0:1], v2 19121; GFX7-NEXT: s_endpgm 19122; 19123; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19124; GFX10-WGP: ; %bb.0: ; %entry 19125; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 19126; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19127; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 19128; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 19129; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 19130; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 19131; GFX10-WGP-NEXT: s_mov_b32 s6, s4 19132; GFX10-WGP-NEXT: s_mov_b32 s7, s5 19133; GFX10-WGP-NEXT: s_mov_b32 s11, s12 19134; GFX10-WGP-NEXT: s_mov_b32 s10, s13 19135; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 19136; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 19137; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19138; GFX10-WGP-NEXT: s_mov_b32 s7, s10 19139; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 19140; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 19141; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19142; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 19143; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 19144; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 19145; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 19146; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 19147; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19148; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 19149; GFX10-WGP-NEXT: buffer_gl0_inv 19150; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 19151; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 19152; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 19153; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 19154; GFX10-WGP-NEXT: s_endpgm 19155; 19156; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19157; GFX10-CU: ; %bb.0: ; %entry 19158; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 19159; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 19160; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 19161; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 19162; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 19163; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 19164; GFX10-CU-NEXT: s_mov_b32 s6, s4 19165; GFX10-CU-NEXT: s_mov_b32 s7, s5 19166; GFX10-CU-NEXT: s_mov_b32 s11, s12 19167; GFX10-CU-NEXT: s_mov_b32 s10, s13 19168; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 19169; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 19170; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19171; GFX10-CU-NEXT: s_mov_b32 s7, s10 19172; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 19173; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 19174; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19175; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 19176; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 19177; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 19178; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19179; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 19180; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 19181; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19182; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 19183; GFX10-CU-NEXT: s_endpgm 19184; 19185; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19186; SKIP-CACHE-INV: ; %bb.0: ; %entry 19187; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 19188; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 19189; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 19190; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 19191; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 19192; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 19193; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 19194; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 19195; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 19196; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 19197; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 19198; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 19199; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 19200; SKIP-CACHE-INV-NEXT: s_mov_b32 
s3, s6 19201; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 19202; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 19203; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19204; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 19205; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 19206; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 19207; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19208; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 19209; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 19210; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19211; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 19212; SKIP-CACHE-INV-NEXT: s_endpgm 19213; 19214; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19215; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 19216; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 19217; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 19218; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 19219; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19220; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 19221; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 19222; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19223; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19224; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19225; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 19226; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19227; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19228; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 19229; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 19230; 19231; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19232; GFX90A-TGSPLIT: ; %bb.0: ; %entry 19233; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 19234; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 19235; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 
19236; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19237; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 19238; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 19239; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19240; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19241; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19242; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19243; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 19244; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19245; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 19246; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19247; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 19248; GFX90A-TGSPLIT-NEXT: s_endpgm 19249; 19250; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19251; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 19252; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 19253; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 19254; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 19255; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19256; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 19257; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 19258; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19259; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19260; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19261; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 19262; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19263; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19264; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 19265; GFX940-NOTTGSPLIT-NEXT: s_endpgm 19266; 19267; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19268; GFX940-TGSPLIT: ; %bb.0: ; %entry 19269; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 19270; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 
19271; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 19272; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19273; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 19274; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 19275; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19276; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19277; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19278; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19279; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 19280; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19281; GFX940-TGSPLIT-NEXT: buffer_inv sc0 19282; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19283; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 19284; GFX940-TGSPLIT-NEXT: s_endpgm 19285; 19286; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19287; GFX11-WGP: ; %bb.0: ; %entry 19288; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19289; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19290; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19291; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 19292; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 19293; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 19294; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19295; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 19296; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 19297; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 19298; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 19299; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 19300; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19301; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 19302; GFX11-WGP-NEXT: buffer_gl0_inv 19303; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 19304; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 19305; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 19306; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 19307; GFX11-WGP-NEXT: s_endpgm 19308; 19309; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19310; GFX11-CU: ; %bb.0: ; %entry 19311; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19312; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19313; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19314; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 19315; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 19316; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 19317; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19318; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 19319; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19320; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19321; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19322; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19323; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19324; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19325; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 19326; GFX11-CU-NEXT: s_endpgm 19327; 19328; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19329; GFX12-WGP: ; %bb.0: ; %entry 19330; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19331; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19332; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19333; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 19334; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 19335; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 19336; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19337; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 19338; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19339; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19340; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 19341; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 19342; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 19343; GFX12-WGP-NEXT: s_wait_storecnt 0x0 19344; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 19345; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 19346; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 19347; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19348; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19349; GFX12-WGP-NEXT: s_wait_dscnt 0x0 19350; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 19351; GFX12-WGP-NEXT: s_endpgm 19352; 19353; 
GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: 19354; GFX12-CU: ; %bb.0: ; %entry 19355; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19356; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19357; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19358; GFX12-CU-NEXT: s_wait_kmcnt 0x0 19359; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 19360; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 19361; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19362; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 19363; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19364; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19365; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 19366; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19367; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19368; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 19369; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 19370; GFX12-CU-NEXT: s_endpgm 19371 ptr %out, i32 %in, i32 %old) { 19372entry: 19373 %gep = getelementptr i32, ptr %out, i32 4 19374 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst 19375 %val0 = extractvalue { i32, i1 } %val, 0 19376 store i32 %val0, ptr %out, align 4 19377 ret void 19378} 19379 19380define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( 19381; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19382; GFX7: ; %bb.0: ; %entry 19383; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 19384; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19385; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 19386; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 19387; GFX7-NEXT: s_mov_b64 s[12:13], 16 19388; GFX7-NEXT: s_waitcnt lgkmcnt(0) 19389; GFX7-NEXT: s_mov_b32 s6, s4 19390; GFX7-NEXT: s_mov_b32 s7, s5 19391; GFX7-NEXT: s_mov_b32 s11, s12 19392; GFX7-NEXT: s_mov_b32 s10, s13 19393; GFX7-NEXT: s_add_u32 s6, s6, s11 19394; GFX7-NEXT: s_addc_u32 s10, s7, s10 19395; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19396; GFX7-NEXT: s_mov_b32 s7, s10 
; Test kernel @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: its IR body (at the end of this block) does a volatile cmpxchg on %out + 4 x i32 at syncscope("workgroup-one-as") with release (success) / seq_cst (failure) ordering, then stores field 0 of the result pair back to %out.
; The prefixed *-LABEL / *-NEXT lines are the FileCheck expectations for each RUN-line prefix; the file header marks them as autogenerated by utils/update_llc_test_checks.py, so regenerate them with that script rather than editing them by hand.
19397; GFX7-NEXT: v_mov_b32_e32 v2, s9 19398; GFX7-NEXT: v_mov_b32_e32 v0, s8 19399; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19400; GFX7-NEXT: v_mov_b32_e32 v3, v0 19401; GFX7-NEXT: v_mov_b32_e32 v0, s6 19402; GFX7-NEXT: v_mov_b32_e32 v1, s7 19403; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19404; GFX7-NEXT: v_mov_b32_e32 v0, s4 19405; GFX7-NEXT: v_mov_b32_e32 v1, s5 19406; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19407; GFX7-NEXT: flat_store_dword v[0:1], v2 19408; GFX7-NEXT: s_endpgm 19409; 19410; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19411; GFX10-WGP: ; %bb.0: ; %entry 19412; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 19413; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19414; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 19415; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 19416; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 19417; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 19418; GFX10-WGP-NEXT: s_mov_b32 s6, s4 19419; GFX10-WGP-NEXT: s_mov_b32 s7, s5 19420; GFX10-WGP-NEXT: s_mov_b32 s11, s12 19421; GFX10-WGP-NEXT: s_mov_b32 s10, s13 19422; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 19423; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 19424; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19425; GFX10-WGP-NEXT: s_mov_b32 s7, s10 19426; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 19427; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 19428; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19429; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 19430; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 19431; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 19432; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 19433; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 19434; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19435; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 19436; GFX10-WGP-NEXT: buffer_gl0_inv 19437; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 19438; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 19439; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
19440; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 19441; GFX10-WGP-NEXT: s_endpgm 19442; 19443; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19444; GFX10-CU: ; %bb.0: ; %entry 19445; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 19446; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19447; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 19448; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 19449; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 19450; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 19451; GFX10-CU-NEXT: s_mov_b32 s6, s4 19452; GFX10-CU-NEXT: s_mov_b32 s7, s5 19453; GFX10-CU-NEXT: s_mov_b32 s11, s12 19454; GFX10-CU-NEXT: s_mov_b32 s10, s13 19455; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 19456; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 19457; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19458; GFX10-CU-NEXT: s_mov_b32 s7, s10 19459; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 19460; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 19461; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19462; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 19463; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 19464; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 19465; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19466; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 19467; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 19468; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19469; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 19470; GFX10-CU-NEXT: s_endpgm 19471; 19472; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19473; SKIP-CACHE-INV: ; %bb.0: ; %entry 19474; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 19475; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 19476; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 19477; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 19478; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 19479; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 19480; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 19481; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 19482; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 19483; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 19484; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 19485; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 19486; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 19487; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 19488; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 19489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 19490; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19491; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 19492; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 19493; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 19494; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19495; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 19496; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 19497; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19498; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 19499; SKIP-CACHE-INV-NEXT: s_endpgm 19500; 19501; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19502; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 19503; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 19504; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 19505; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 19506; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19507; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 19508; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 19509; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19510; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19511; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19512; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 19513; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19514; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19515; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 19516; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 19517; 
19518; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19519; GFX90A-TGSPLIT: ; %bb.0: ; %entry 19520; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 19521; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 19522; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 19523; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19524; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 19525; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 19526; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19527; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19528; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19529; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19530; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 19531; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19532; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 19533; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19534; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 19535; GFX90A-TGSPLIT-NEXT: s_endpgm 19536; 19537; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19538; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 19539; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 19540; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 19541; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 19542; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19543; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 19544; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 19545; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19546; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19547; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19548; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 19549; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19550; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19551; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 
sc0 sc1 19552; GFX940-NOTTGSPLIT-NEXT: s_endpgm 19553; 19554; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19555; GFX940-TGSPLIT: ; %bb.0: ; %entry 19556; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 19557; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 19558; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 19559; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19560; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 19561; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 19562; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19563; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19564; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19565; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19566; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 19567; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19568; GFX940-TGSPLIT-NEXT: buffer_inv sc0 19569; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19570; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 19571; GFX940-TGSPLIT-NEXT: s_endpgm 19572; 19573; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19574; GFX11-WGP: ; %bb.0: ; %entry 19575; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19576; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19577; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19578; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 19579; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 19580; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 19581; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19582; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 19583; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 19584; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 19585; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 19586; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 19587; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19588; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 19589; GFX11-WGP-NEXT: buffer_gl0_inv 19590; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 19591; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 19592; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 19593; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 19594; GFX11-WGP-NEXT: s_endpgm 19595; 19596; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19597; GFX11-CU: ; %bb.0: ; %entry 19598; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19599; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19600; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19601; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 19602; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 19603; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 19604; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19605; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 19606; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19607; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19608; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19609; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19610; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19611; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19612; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 19613; GFX11-CU-NEXT: s_endpgm 19614; 19615; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19616; GFX12-WGP: ; %bb.0: ; %entry 19617; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19618; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19619; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19620; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 19621; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 19622; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 19623; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19624; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 19625; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19626; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19627; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 19628; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 19629; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 19630; GFX12-WGP-NEXT: s_wait_storecnt 0x0 19631; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 19632; GFX12-WGP-NEXT: 
s_wait_bvhcnt 0x0 19633; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 19634; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 19635; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 19636; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19637; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19638; GFX12-WGP-NEXT: s_wait_dscnt 0x0 19639; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 19640; GFX12-WGP-NEXT: s_endpgm 19641; 19642; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: 19643; GFX12-CU: ; %bb.0: ; %entry 19644; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19645; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19646; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19647; GFX12-CU-NEXT: s_wait_kmcnt 0x0 19648; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 19649; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 19650; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19651; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 19652; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19653; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19654; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 19655; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19656; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19657; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 19658; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 19659; GFX12-CU-NEXT: s_endpgm 19660 ptr %out, i32 %in, i32 %old) { 19661entry: 19662 %gep = getelementptr i32, ptr %out, i32 4 19663 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst 19664 %val0 = extractvalue { i32, i1 } %val, 0 19665 store i32 %val0, ptr %out, align 4 19666 ret void 19667} 19668 19669define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( 19670; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19671; GFX7: ; %bb.0: ; %entry 19672; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 19673; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19674; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 19675; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 19676; GFX7-NEXT: s_mov_b64 
s[12:13], 16 19677; GFX7-NEXT: s_waitcnt lgkmcnt(0) 19678; GFX7-NEXT: s_mov_b32 s6, s4 19679; GFX7-NEXT: s_mov_b32 s7, s5 19680; GFX7-NEXT: s_mov_b32 s11, s12 19681; GFX7-NEXT: s_mov_b32 s10, s13 19682; GFX7-NEXT: s_add_u32 s6, s6, s11 19683; GFX7-NEXT: s_addc_u32 s10, s7, s10 19684; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19685; GFX7-NEXT: s_mov_b32 s7, s10 19686; GFX7-NEXT: v_mov_b32_e32 v2, s9 19687; GFX7-NEXT: v_mov_b32_e32 v0, s8 19688; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19689; GFX7-NEXT: v_mov_b32_e32 v3, v0 19690; GFX7-NEXT: v_mov_b32_e32 v0, s6 19691; GFX7-NEXT: v_mov_b32_e32 v1, s7 19692; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19693; GFX7-NEXT: v_mov_b32_e32 v0, s4 19694; GFX7-NEXT: v_mov_b32_e32 v1, s5 19695; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19696; GFX7-NEXT: flat_store_dword v[0:1], v2 19697; GFX7-NEXT: s_endpgm 19698; 19699; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19700; GFX10-WGP: ; %bb.0: ; %entry 19701; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 19702; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19703; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 19704; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 19705; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 19706; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 19707; GFX10-WGP-NEXT: s_mov_b32 s6, s4 19708; GFX10-WGP-NEXT: s_mov_b32 s7, s5 19709; GFX10-WGP-NEXT: s_mov_b32 s11, s12 19710; GFX10-WGP-NEXT: s_mov_b32 s10, s13 19711; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 19712; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 19713; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19714; GFX10-WGP-NEXT: s_mov_b32 s7, s10 19715; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 19716; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 19717; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19718; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 19719; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 19720; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s7 19721; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 19722; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 19723; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19724; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 19725; GFX10-WGP-NEXT: buffer_gl0_inv 19726; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 19727; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 19728; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 19729; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 19730; GFX10-WGP-NEXT: s_endpgm 19731; 19732; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19733; GFX10-CU: ; %bb.0: ; %entry 19734; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 19735; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19736; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 19737; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 19738; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 19739; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 19740; GFX10-CU-NEXT: s_mov_b32 s6, s4 19741; GFX10-CU-NEXT: s_mov_b32 s7, s5 19742; GFX10-CU-NEXT: s_mov_b32 s11, s12 19743; GFX10-CU-NEXT: s_mov_b32 s10, s13 19744; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 19745; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 19746; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19747; GFX10-CU-NEXT: s_mov_b32 s7, s10 19748; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 19749; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 19750; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19751; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 19752; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 19753; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 19754; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19755; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 19756; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 19757; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19758; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 19759; GFX10-CU-NEXT: s_endpgm 19760; 19761; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19762; SKIP-CACHE-INV: ; %bb.0: ; %entry 19763; SKIP-CACHE-INV-NEXT: s_mov_b64 
s[2:3], s[4:5] 19764; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 19765; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 19766; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 19767; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 19768; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 19769; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 19770; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 19771; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 19772; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 19773; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 19774; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 19775; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 19776; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 19777; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 19778; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 19779; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19780; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 19781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 19782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 19783; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19784; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 19785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 19786; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19787; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 19788; SKIP-CACHE-INV-NEXT: s_endpgm 19789; 19790; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19791; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 19792; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 19793; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 19794; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 19795; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19796; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 19797; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 19798; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19799; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19800; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19801; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 19802; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19803; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19804; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 19805; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 19806; 19807; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19808; GFX90A-TGSPLIT: ; %bb.0: ; %entry 19809; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 19810; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 19811; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 19812; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19813; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 19814; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 19815; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19816; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19817; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19818; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19819; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 19820; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19821; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 19822; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 19823; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 19824; GFX90A-TGSPLIT-NEXT: s_endpgm 19825; 19826; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19827; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 19828; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 19829; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 19830; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 19831; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19832; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 19833; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 19834; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec 19835; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19836; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19837; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 19838; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19839; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19840; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 19841; GFX940-NOTTGSPLIT-NEXT: s_endpgm 19842; 19843; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19844; GFX940-TGSPLIT: ; %bb.0: ; %entry 19845; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 19846; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 19847; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 19848; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 19849; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 19850; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 19851; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19852; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 19853; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19854; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19855; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 19856; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 19857; GFX940-TGSPLIT-NEXT: buffer_inv sc0 19858; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 19859; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 19860; GFX940-TGSPLIT-NEXT: s_endpgm 19861; 19862; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19863; GFX11-WGP: ; %bb.0: ; %entry 19864; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19865; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19866; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19867; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 19868; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 19869; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 19870; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19871; GFX11-WGP-NEXT: 
v_mov_b32_e32 v3, v0 19872; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 19873; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 19874; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 19875; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 19876; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19877; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 19878; GFX11-WGP-NEXT: buffer_gl0_inv 19879; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 19880; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 19881; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 19882; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 19883; GFX11-WGP-NEXT: s_endpgm 19884; 19885; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19886; GFX11-CU: ; %bb.0: ; %entry 19887; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19888; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19889; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19890; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 19891; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 19892; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 19893; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19894; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 19895; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19896; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19897; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19898; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19899; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19900; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19901; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 19902; GFX11-CU-NEXT: s_endpgm 19903; 19904; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19905; GFX12-WGP: ; %bb.0: ; %entry 19906; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19907; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19908; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19909; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 19910; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 19911; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 19912; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19913; GFX12-WGP-NEXT: 
v_mov_b32_e32 v3, v0 19914; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19915; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19916; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 19917; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 19918; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 19919; GFX12-WGP-NEXT: s_wait_storecnt 0x0 19920; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 19921; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 19922; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 19923; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 19924; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 19925; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19926; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19927; GFX12-WGP-NEXT: s_wait_dscnt 0x0 19928; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 19929; GFX12-WGP-NEXT: s_endpgm 19930; 19931; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: 19932; GFX12-CU: ; %bb.0: ; %entry 19933; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19934; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19935; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19936; GFX12-CU-NEXT: s_wait_kmcnt 0x0 19937; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 19938; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 19939; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19940; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 19941; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19942; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19943; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 19944; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19945; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19946; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 19947; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 19948; GFX12-CU-NEXT: s_endpgm 19949 ptr %out, i32 %in, i32 %old) { 19950entry: 19951 %gep = getelementptr i32, ptr %out, i32 4 19952 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst 19953 %val0 = extractvalue { i32, i1 } %val, 0 19954 store i32 %val0, ptr %out, align 4 19955 ret void 19956} 
; Returning cmpxchg through a flat pointer at syncscope("workgroup-one-as")
; with seq_cst success ordering and seq_cst failure ordering.
; The exchange targets %out advanced by four i32 elements (16 bytes), compares
; against %old and swaps in %in; the loaded value (field 0 of the result pair)
; is then stored back to %out.
; The per-target assertion lines in this function are autogenerated (see the
; note at the top of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
19957 19958define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( 19959; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 19960; GFX7: ; %bb.0: ; %entry 19961; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 19962; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19963; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 19964; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 19965; GFX7-NEXT: s_mov_b64 s[12:13], 16 19966; GFX7-NEXT: s_waitcnt lgkmcnt(0) 19967; GFX7-NEXT: s_mov_b32 s6, s4 19968; GFX7-NEXT: s_mov_b32 s7, s5 19969; GFX7-NEXT: s_mov_b32 s11, s12 19970; GFX7-NEXT: s_mov_b32 s10, s13 19971; GFX7-NEXT: s_add_u32 s6, s6, s11 19972; GFX7-NEXT: s_addc_u32 s10, s7, s10 19973; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 19974; GFX7-NEXT: s_mov_b32 s7, s10 19975; GFX7-NEXT: v_mov_b32_e32 v2, s9 19976; GFX7-NEXT: v_mov_b32_e32 v0, s8 19977; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19978; GFX7-NEXT: v_mov_b32_e32 v3, v0 19979; GFX7-NEXT: v_mov_b32_e32 v0, s6 19980; GFX7-NEXT: v_mov_b32_e32 v1, s7 19981; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 19982; GFX7-NEXT: v_mov_b32_e32 v0, s4 19983; GFX7-NEXT: v_mov_b32_e32 v1, s5 19984; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19985; GFX7-NEXT: flat_store_dword v[0:1], v2 19986; GFX7-NEXT: s_endpgm 19987; 19988; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 19989; GFX10-WGP: ; %bb.0: ; %entry 19990; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 19991; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 19992; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 19993; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 19994; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 19995; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 19996; GFX10-WGP-NEXT: s_mov_b32 s6, s4 19997; GFX10-WGP-NEXT: s_mov_b32 s7, s5 19998; GFX10-WGP-NEXT: s_mov_b32 s11, s12 19999; GFX10-WGP-NEXT: s_mov_b32 s10, s13 20000; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 20001; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 
20002; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 20003; GFX10-WGP-NEXT: s_mov_b32 s7, s10 20004; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 20005; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 20006; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20007; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 20008; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 20009; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 20010; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 20011; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 20012; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 20013; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 20014; GFX10-WGP-NEXT: buffer_gl0_inv 20015; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 20016; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 20017; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 20018; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 20019; GFX10-WGP-NEXT: s_endpgm 20020; 20021; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20022; GFX10-CU: ; %bb.0: ; %entry 20023; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 20024; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 20025; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 20026; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 20027; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 20028; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 20029; GFX10-CU-NEXT: s_mov_b32 s6, s4 20030; GFX10-CU-NEXT: s_mov_b32 s7, s5 20031; GFX10-CU-NEXT: s_mov_b32 s11, s12 20032; GFX10-CU-NEXT: s_mov_b32 s10, s13 20033; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 20034; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 20035; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 20036; GFX10-CU-NEXT: s_mov_b32 s7, s10 20037; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 20038; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 20039; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20040; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 20041; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 20042; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 20043; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc 20044; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 20045; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 20046; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 20047; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 20048; GFX10-CU-NEXT: s_endpgm 20049; 20050; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20051; SKIP-CACHE-INV: ; %bb.0: ; %entry 20052; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 20053; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 20054; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 20055; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 20056; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 20057; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 20058; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 20059; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 20060; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 20061; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 20062; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 20063; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 20064; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 20065; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 20066; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 20067; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 20068; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20069; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 20070; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 20071; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 20072; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 20073; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 20074; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 20075; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 20076; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 20077; SKIP-CACHE-INV-NEXT: s_endpgm 20078; 20079; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20080; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 20081; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 20082; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 
0x8 20083; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 20084; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 20085; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 20086; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 20087; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20088; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 20089; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 20090; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 20091; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 20092; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 20093; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 20094; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 20095; 20096; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20097; GFX90A-TGSPLIT: ; %bb.0: ; %entry 20098; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 20099; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 20100; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 20101; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 20102; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 20103; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 20104; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20105; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 20106; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 20107; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 20108; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 20109; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 20110; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 20111; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 20112; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 20113; GFX90A-TGSPLIT-NEXT: s_endpgm 20114; 20115; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20116; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 20117; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 20118; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 20119; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 20120; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 20121; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 20122; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 20123; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20124; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 20125; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 20126; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 20127; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 20128; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 20129; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 20130; GFX940-NOTTGSPLIT-NEXT: s_endpgm 20131; 20132; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20133; GFX940-TGSPLIT: ; %bb.0: ; %entry 20134; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 20135; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 20136; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 20137; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 20138; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 20139; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 20140; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20141; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 20142; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 20143; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 20144; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 20145; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 20146; GFX940-TGSPLIT-NEXT: buffer_inv sc0 20147; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 20148; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 20149; GFX940-TGSPLIT-NEXT: s_endpgm 20150; 20151; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20152; GFX11-WGP: ; %bb.0: ; %entry 20153; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 20154; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 20155; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 20156; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 20157; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 20158; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 20159; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20160; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 20161; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 20162; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 20163; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 20164; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 20165; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 20166; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 20167; GFX11-WGP-NEXT: buffer_gl0_inv 20168; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 20169; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 20170; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 20171; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 20172; GFX11-WGP-NEXT: s_endpgm 20173; 20174; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20175; GFX11-CU: ; %bb.0: ; %entry 20176; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 20177; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 20178; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 20179; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 20180; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 20181; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 20182; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20183; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 20184; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 20185; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 20186; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 20187; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 20188; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 20189; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 20190; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 20191; GFX11-CU-NEXT: s_endpgm 20192; 20193; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20194; GFX12-WGP: ; %bb.0: ; %entry 20195; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 20196; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 20197; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 20198; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 20199; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 20200; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 20201; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20202; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 20203; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 20204; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 20205; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 20206; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 20207; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 20208; GFX12-WGP-NEXT: s_wait_storecnt 0x0 20209; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE 20210; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 20211; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 20212; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 20213; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE 20214; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 20215; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 20216; GFX12-WGP-NEXT: s_wait_dscnt 0x0 20217; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 20218; GFX12-WGP-NEXT: s_endpgm 20219; 20220; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: 20221; GFX12-CU: ; %bb.0: ; %entry 20222; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 20223; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 20224; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 20225; GFX12-CU-NEXT: s_wait_kmcnt 0x0 20226; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 20227; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 20228; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 20229; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 20230; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 20231; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 20232; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 20233; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 20234; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 20235; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 20236; GFX12-CU-NEXT: 
flat_store_b32 v[0:1], v2 20237; GFX12-CU-NEXT: s_endpgm 20238 ptr %out, i32 %in, i32 %old) { 20239entry: 20240 %gep = getelementptr i32, ptr %out, i32 4 20241 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst 20242 %val0 = extractvalue { i32, i1 } %val, 0 20243 store i32 %val0, ptr %out, align 4 20244 ret void 20245} 20246