1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 6; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s 7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s 8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s 9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s 10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s 11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s 12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s 13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s 14 15define amdgpu_kernel void @flat_singlethread_unordered_load( 16; GFX7-LABEL: flat_singlethread_unordered_load: 17; GFX7: ; %bb.0: ; %entry 18; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 19; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 20; GFX7-NEXT: s_waitcnt lgkmcnt(0) 21; GFX7-NEXT: v_mov_b32_e32 v0, s6 22; GFX7-NEXT: v_mov_b32_e32 v1, s7 23; GFX7-NEXT: flat_load_dword v2, v[0:1] 24; GFX7-NEXT: v_mov_b32_e32 v0, s4 25; GFX7-NEXT: v_mov_b32_e32 v1, s5 26; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 27; GFX7-NEXT: flat_store_dword 
v[0:1], v2 28; GFX7-NEXT: s_endpgm 29; 30; GFX10-WGP-LABEL: flat_singlethread_unordered_load: 31; GFX10-WGP: ; %bb.0: ; %entry 32; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 33; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 34; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 35; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 36; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 37; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 38; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 39; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 40; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 41; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 42; GFX10-WGP-NEXT: s_endpgm 43; 44; GFX10-CU-LABEL: flat_singlethread_unordered_load: 45; GFX10-CU: ; %bb.0: ; %entry 46; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 47; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 48; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 49; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 50; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 51; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 52; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 53; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 54; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 55; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 56; GFX10-CU-NEXT: s_endpgm 57; 58; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load: 59; SKIP-CACHE-INV: ; %bb.0: ; %entry 60; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 61; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 62; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 63; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 64; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 65; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 66; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 67; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 68; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 69; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 70; SKIP-CACHE-INV-NEXT: s_endpgm 71; 72; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: 73; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 74; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 
0x0 75; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 76; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 77; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 78; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 79; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 80; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 81; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 82; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 83; 84; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: 85; GFX90A-TGSPLIT: ; %bb.0: ; %entry 86; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 87; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 88; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 89; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 90; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 91; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 92; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 93; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 94; GFX90A-TGSPLIT-NEXT: s_endpgm 95; 96; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: 97; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 98; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 99; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 100; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 101; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 102; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 103; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 104; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 105; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 106; GFX940-NOTTGSPLIT-NEXT: s_endpgm 107; 108; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load: 109; GFX940-TGSPLIT: ; %bb.0: ; %entry 110; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 111; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 112; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 113; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] 114; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 115; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 116; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 117; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 118; GFX940-TGSPLIT-NEXT: s_endpgm 119; 120; GFX11-WGP-LABEL: flat_singlethread_unordered_load: 121; GFX11-WGP: ; %bb.0: ; %entry 122; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 123; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 124; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 125; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 126; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 127; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 128; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 129; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 130; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 131; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 132; GFX11-WGP-NEXT: s_endpgm 133; 134; GFX11-CU-LABEL: flat_singlethread_unordered_load: 135; GFX11-CU: ; %bb.0: ; %entry 136; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 137; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 138; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 139; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 140; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 141; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 142; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 143; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 144; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 145; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 146; GFX11-CU-NEXT: s_endpgm 147; 148; GFX12-WGP-LABEL: flat_singlethread_unordered_load: 149; GFX12-WGP: ; %bb.0: ; %entry 150; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 151; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 152; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 153; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 154; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 155; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 156; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 157; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 158; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 159; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 160; GFX12-WGP-NEXT: 
s_endpgm 161; 162; GFX12-CU-LABEL: flat_singlethread_unordered_load: 163; GFX12-CU: ; %bb.0: ; %entry 164; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 165; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 166; GFX12-CU-NEXT: s_wait_kmcnt 0x0 167; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 168; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 169; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 170; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 171; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 172; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 173; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 174; GFX12-CU-NEXT: s_endpgm 175 ptr %in, ptr %out) { 176entry: 177 %val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4 178 store i32 %val, ptr %out 179 ret void 180} 181 182define amdgpu_kernel void @flat_singlethread_monotonic_load( 183; GFX7-LABEL: flat_singlethread_monotonic_load: 184; GFX7: ; %bb.0: ; %entry 185; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 186; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 187; GFX7-NEXT: s_waitcnt lgkmcnt(0) 188; GFX7-NEXT: v_mov_b32_e32 v0, s6 189; GFX7-NEXT: v_mov_b32_e32 v1, s7 190; GFX7-NEXT: flat_load_dword v2, v[0:1] 191; GFX7-NEXT: v_mov_b32_e32 v0, s4 192; GFX7-NEXT: v_mov_b32_e32 v1, s5 193; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 194; GFX7-NEXT: flat_store_dword v[0:1], v2 195; GFX7-NEXT: s_endpgm 196; 197; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: 198; GFX10-WGP: ; %bb.0: ; %entry 199; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 200; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 201; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 202; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 203; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 204; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 205; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 206; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 207; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 208; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 209; GFX10-WGP-NEXT: s_endpgm 210; 211; GFX10-CU-LABEL: flat_singlethread_monotonic_load: 212; GFX10-CU: ; 
%bb.0: ; %entry 213; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 214; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 215; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 216; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 217; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 218; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 219; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 220; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 221; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 222; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 223; GFX10-CU-NEXT: s_endpgm 224; 225; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load: 226; SKIP-CACHE-INV: ; %bb.0: ; %entry 227; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 228; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 229; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 230; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 231; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 232; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 234; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 235; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 236; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 237; SKIP-CACHE-INV-NEXT: s_endpgm 238; 239; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: 240; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 241; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 242; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 243; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 244; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 245; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 246; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 247; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 248; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 249; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 250; 251; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: 252; GFX90A-TGSPLIT: ; %bb.0: ; %entry 253; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 254; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 255; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 256; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 257; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 258; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 259; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 260; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 261; GFX90A-TGSPLIT-NEXT: s_endpgm 262; 263; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: 264; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 265; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 266; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 267; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 268; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 269; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 270; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 271; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 272; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 273; GFX940-NOTTGSPLIT-NEXT: s_endpgm 274; 275; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load: 276; GFX940-TGSPLIT: ; %bb.0: ; %entry 277; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 278; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 279; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 280; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 281; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 282; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 283; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 284; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 285; GFX940-TGSPLIT-NEXT: s_endpgm 286; 287; GFX11-WGP-LABEL: flat_singlethread_monotonic_load: 288; GFX11-WGP: ; %bb.0: ; %entry 289; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 290; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 291; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 292; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 293; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 294; GFX11-WGP-NEXT: 
flat_load_b32 v2, v[0:1] 295; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 296; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 297; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 298; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 299; GFX11-WGP-NEXT: s_endpgm 300; 301; GFX11-CU-LABEL: flat_singlethread_monotonic_load: 302; GFX11-CU: ; %bb.0: ; %entry 303; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 304; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 305; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 306; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 307; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 308; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 309; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 310; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 311; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 312; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 313; GFX11-CU-NEXT: s_endpgm 314; 315; GFX12-WGP-LABEL: flat_singlethread_monotonic_load: 316; GFX12-WGP: ; %bb.0: ; %entry 317; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 318; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 319; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 320; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 321; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 322; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 323; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 324; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 325; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 326; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 327; GFX12-WGP-NEXT: s_endpgm 328; 329; GFX12-CU-LABEL: flat_singlethread_monotonic_load: 330; GFX12-CU: ; %bb.0: ; %entry 331; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 332; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 333; GFX12-CU-NEXT: s_wait_kmcnt 0x0 334; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 335; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 336; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 337; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 338; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 339; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 340; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 341; GFX12-CU-NEXT: s_endpgm 342 ptr %in, ptr %out) { 343entry: 344 %val = load atomic 
i32, ptr %in syncscope("singlethread") monotonic, align 4 345 store i32 %val, ptr %out 346 ret void 347} 348 349define amdgpu_kernel void @flat_singlethread_acquire_load( 350; GFX7-LABEL: flat_singlethread_acquire_load: 351; GFX7: ; %bb.0: ; %entry 352; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 353; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 354; GFX7-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7-NEXT: v_mov_b32_e32 v0, s6 356; GFX7-NEXT: v_mov_b32_e32 v1, s7 357; GFX7-NEXT: flat_load_dword v2, v[0:1] 358; GFX7-NEXT: v_mov_b32_e32 v0, s4 359; GFX7-NEXT: v_mov_b32_e32 v1, s5 360; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 361; GFX7-NEXT: flat_store_dword v[0:1], v2 362; GFX7-NEXT: s_endpgm 363; 364; GFX10-WGP-LABEL: flat_singlethread_acquire_load: 365; GFX10-WGP: ; %bb.0: ; %entry 366; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 367; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 368; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 369; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 370; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 371; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 372; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 373; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 374; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 375; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 376; GFX10-WGP-NEXT: s_endpgm 377; 378; GFX10-CU-LABEL: flat_singlethread_acquire_load: 379; GFX10-CU: ; %bb.0: ; %entry 380; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 381; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 382; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 383; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 384; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 385; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 386; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 387; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 388; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 389; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 390; GFX10-CU-NEXT: s_endpgm 391; 392; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load: 393; SKIP-CACHE-INV: ; %bb.0: ; %entry 394; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 395; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 396; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 397; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 398; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 399; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 400; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 401; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 402; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 403; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 404; SKIP-CACHE-INV-NEXT: s_endpgm 405; 406; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: 407; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 408; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 409; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 410; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 411; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 412; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 413; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 414; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 415; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 416; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 417; 418; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: 419; GFX90A-TGSPLIT: ; %bb.0: ; %entry 420; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 421; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 422; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 423; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 424; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 425; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 426; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 427; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 428; GFX90A-TGSPLIT-NEXT: s_endpgm 429; 430; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: 431; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 432; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 433; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 434; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 435; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 436; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 437; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 438; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 439; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 440; GFX940-NOTTGSPLIT-NEXT: s_endpgm 441; 442; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load: 443; GFX940-TGSPLIT: ; %bb.0: ; %entry 444; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 445; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 446; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 447; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 448; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 449; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 450; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 451; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 452; GFX940-TGSPLIT-NEXT: s_endpgm 453; 454; GFX11-WGP-LABEL: flat_singlethread_acquire_load: 455; GFX11-WGP: ; %bb.0: ; %entry 456; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 457; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 458; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 459; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 460; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 461; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 462; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 463; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 464; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 465; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 466; GFX11-WGP-NEXT: s_endpgm 467; 468; GFX11-CU-LABEL: flat_singlethread_acquire_load: 469; GFX11-CU: ; %bb.0: ; %entry 470; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 471; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 472; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 473; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 474; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 475; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 476; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 477; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 478; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 479; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 480; GFX11-CU-NEXT: s_endpgm 481; 482; GFX12-WGP-LABEL: flat_singlethread_acquire_load: 483; GFX12-WGP: ; %bb.0: ; %entry 484; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 485; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 486; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 487; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 488; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 489; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 490; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 491; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 492; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 493; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 494; GFX12-WGP-NEXT: s_endpgm 495; 496; GFX12-CU-LABEL: flat_singlethread_acquire_load: 497; GFX12-CU: ; %bb.0: ; %entry 498; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 499; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 500; GFX12-CU-NEXT: s_wait_kmcnt 0x0 501; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 502; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 503; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 504; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 505; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 506; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 507; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 508; GFX12-CU-NEXT: s_endpgm 509 ptr %in, ptr %out) { 510entry: 511 %val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4 512 store i32 %val, ptr %out 513 ret void 514} 515 516define amdgpu_kernel void @flat_singlethread_seq_cst_load( 517; GFX7-LABEL: flat_singlethread_seq_cst_load: 518; GFX7: ; %bb.0: ; %entry 519; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 520; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 521; GFX7-NEXT: s_waitcnt lgkmcnt(0) 522; GFX7-NEXT: v_mov_b32_e32 v0, s6 523; GFX7-NEXT: v_mov_b32_e32 v1, s7 524; GFX7-NEXT: flat_load_dword v2, v[0:1] 525; GFX7-NEXT: v_mov_b32_e32 v0, s4 526; GFX7-NEXT: v_mov_b32_e32 v1, s5 527; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 528; GFX7-NEXT: flat_store_dword v[0:1], 
v2 529; GFX7-NEXT: s_endpgm 530; 531; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: 532; GFX10-WGP: ; %bb.0: ; %entry 533; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 534; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 535; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 536; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 537; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 538; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 539; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 540; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 541; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 542; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 543; GFX10-WGP-NEXT: s_endpgm 544; 545; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: 546; GFX10-CU: ; %bb.0: ; %entry 547; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 548; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 549; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 550; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 551; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 552; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 553; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 554; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 555; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 556; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 557; GFX10-CU-NEXT: s_endpgm 558; 559; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load: 560; SKIP-CACHE-INV: ; %bb.0: ; %entry 561; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 562; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 563; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 564; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 565; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 566; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 567; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 568; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 569; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 570; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 571; SKIP-CACHE-INV-NEXT: s_endpgm 572; 573; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: 574; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 575; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 576; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 577; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 578; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 579; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 580; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 581; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 582; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 583; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 584; 585; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: 586; GFX90A-TGSPLIT: ; %bb.0: ; %entry 587; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 588; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 589; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 590; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 591; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 592; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 593; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 594; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 595; GFX90A-TGSPLIT-NEXT: s_endpgm 596; 597; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: 598; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 599; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 600; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 601; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 602; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 603; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 604; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 605; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 606; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 607; GFX940-NOTTGSPLIT-NEXT: s_endpgm 608; 609; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: 610; GFX940-TGSPLIT: ; %bb.0: ; %entry 611; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 612; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 613; GFX940-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 614; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 615; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 616; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 617; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 618; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 619; GFX940-TGSPLIT-NEXT: s_endpgm 620; 621; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load: 622; GFX11-WGP: ; %bb.0: ; %entry 623; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 624; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 625; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 626; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 627; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 628; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 629; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 630; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 631; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 632; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 633; GFX11-WGP-NEXT: s_endpgm 634; 635; GFX11-CU-LABEL: flat_singlethread_seq_cst_load: 636; GFX11-CU: ; %bb.0: ; %entry 637; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 638; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 639; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 640; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 641; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 642; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 643; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 644; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 645; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 646; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 647; GFX11-CU-NEXT: s_endpgm 648; 649; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load: 650; GFX12-WGP: ; %bb.0: ; %entry 651; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 652; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 653; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 654; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 655; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 656; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 657; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 658; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 659; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 660; GFX12-WGP-NEXT: 
flat_store_b32 v[0:1], v2 661; GFX12-WGP-NEXT: s_endpgm 662; 663; GFX12-CU-LABEL: flat_singlethread_seq_cst_load: 664; GFX12-CU: ; %bb.0: ; %entry 665; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 666; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 667; GFX12-CU-NEXT: s_wait_kmcnt 0x0 668; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 669; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 670; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 671; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 672; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 673; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 674; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 675; GFX12-CU-NEXT: s_endpgm 676 ptr %in, ptr %out) { 677entry: 678 %val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4 679 store i32 %val, ptr %out 680 ret void 681} 682 683define amdgpu_kernel void @flat_singlethread_unordered_store( 684; GFX7-LABEL: flat_singlethread_unordered_store: 685; GFX7: ; %bb.0: ; %entry 686; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 687; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 688; GFX7-NEXT: s_waitcnt lgkmcnt(0) 689; GFX7-NEXT: v_mov_b32_e32 v0, s6 690; GFX7-NEXT: v_mov_b32_e32 v1, s7 691; GFX7-NEXT: v_mov_b32_e32 v2, s4 692; GFX7-NEXT: flat_store_dword v[0:1], v2 693; GFX7-NEXT: s_endpgm 694; 695; GFX10-WGP-LABEL: flat_singlethread_unordered_store: 696; GFX10-WGP: ; %bb.0: ; %entry 697; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 698; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 699; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 700; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 701; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 702; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 703; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 704; GFX10-WGP-NEXT: s_endpgm 705; 706; GFX10-CU-LABEL: flat_singlethread_unordered_store: 707; GFX10-CU: ; %bb.0: ; %entry 708; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 709; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 710; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 711; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 712; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s7 713; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 714; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 715; GFX10-CU-NEXT: s_endpgm 716; 717; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store: 718; SKIP-CACHE-INV: ; %bb.0: ; %entry 719; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 720; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 721; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 722; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 723; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 724; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 725; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 726; SKIP-CACHE-INV-NEXT: s_endpgm 727; 728; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: 729; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 730; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 731; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 732; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 733; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 734; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 735; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 736; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 737; 738; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: 739; GFX90A-TGSPLIT: ; %bb.0: ; %entry 740; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 741; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 742; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 743; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 744; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 745; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 746; GFX90A-TGSPLIT-NEXT: s_endpgm 747; 748; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: 749; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 750; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 751; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 752; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 753; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 754; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s0 755; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 756; GFX940-NOTTGSPLIT-NEXT: s_endpgm 757; 758; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store: 759; GFX940-TGSPLIT: ; %bb.0: ; %entry 760; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 761; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 762; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 763; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 764; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 765; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 766; GFX940-TGSPLIT-NEXT: s_endpgm 767; 768; GFX11-WGP-LABEL: flat_singlethread_unordered_store: 769; GFX11-WGP: ; %bb.0: ; %entry 770; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 771; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 772; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 773; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 774; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 775; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 776; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 777; GFX11-WGP-NEXT: s_endpgm 778; 779; GFX11-CU-LABEL: flat_singlethread_unordered_store: 780; GFX11-CU: ; %bb.0: ; %entry 781; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 782; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 783; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 784; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 785; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 786; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 787; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 788; GFX11-CU-NEXT: s_endpgm 789; 790; GFX12-WGP-LABEL: flat_singlethread_unordered_store: 791; GFX12-WGP: ; %bb.0: ; %entry 792; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 793; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 794; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 795; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 796; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 797; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 798; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 799; GFX12-WGP-NEXT: s_endpgm 800; 801; GFX12-CU-LABEL: flat_singlethread_unordered_store: 802; GFX12-CU: ; %bb.0: ; %entry 803; 
GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 804; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 805; GFX12-CU-NEXT: s_wait_kmcnt 0x0 806; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 807; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 808; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 809; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 810; GFX12-CU-NEXT: s_endpgm 811 i32 %in, ptr %out) { 812entry: 813 store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 814 ret void 815} 816 817define amdgpu_kernel void @flat_singlethread_monotonic_store( 818; GFX7-LABEL: flat_singlethread_monotonic_store: 819; GFX7: ; %bb.0: ; %entry 820; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 821; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 822; GFX7-NEXT: s_waitcnt lgkmcnt(0) 823; GFX7-NEXT: v_mov_b32_e32 v0, s6 824; GFX7-NEXT: v_mov_b32_e32 v1, s7 825; GFX7-NEXT: v_mov_b32_e32 v2, s4 826; GFX7-NEXT: flat_store_dword v[0:1], v2 827; GFX7-NEXT: s_endpgm 828; 829; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: 830; GFX10-WGP: ; %bb.0: ; %entry 831; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 832; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 833; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 834; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 835; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 836; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 837; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 838; GFX10-WGP-NEXT: s_endpgm 839; 840; GFX10-CU-LABEL: flat_singlethread_monotonic_store: 841; GFX10-CU: ; %bb.0: ; %entry 842; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 843; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 844; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 845; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 846; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 847; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 848; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 849; GFX10-CU-NEXT: s_endpgm 850; 851; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store: 852; SKIP-CACHE-INV: ; %bb.0: ; %entry 853; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 854; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 855; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 856; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 857; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 858; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 859; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 860; SKIP-CACHE-INV-NEXT: s_endpgm 861; 862; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: 863; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 864; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 865; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 866; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 867; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 868; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 869; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 870; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 871; 872; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: 873; GFX90A-TGSPLIT: ; %bb.0: ; %entry 874; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 875; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 876; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 877; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 878; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 879; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 880; GFX90A-TGSPLIT-NEXT: s_endpgm 881; 882; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: 883; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 884; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 885; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 886; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 887; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 888; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 889; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 890; GFX940-NOTTGSPLIT-NEXT: s_endpgm 891; 892; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store: 893; GFX940-TGSPLIT: ; %bb.0: ; %entry 894; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 895; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 896; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 897; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 898; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 899; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 900; GFX940-TGSPLIT-NEXT: s_endpgm 901; 902; GFX11-WGP-LABEL: flat_singlethread_monotonic_store: 903; GFX11-WGP: ; %bb.0: ; %entry 904; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 905; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 906; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 907; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 908; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 909; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 910; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 911; GFX11-WGP-NEXT: s_endpgm 912; 913; GFX11-CU-LABEL: flat_singlethread_monotonic_store: 914; GFX11-CU: ; %bb.0: ; %entry 915; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 916; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 917; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 918; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 919; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 920; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 921; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 922; GFX11-CU-NEXT: s_endpgm 923; 924; GFX12-WGP-LABEL: flat_singlethread_monotonic_store: 925; GFX12-WGP: ; %bb.0: ; %entry 926; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 927; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 928; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 929; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 930; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 931; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 932; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 933; GFX12-WGP-NEXT: s_endpgm 934; 935; GFX12-CU-LABEL: flat_singlethread_monotonic_store: 936; GFX12-CU: ; %bb.0: ; %entry 937; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 938; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 939; GFX12-CU-NEXT: s_wait_kmcnt 0x0 940; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 941; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 942; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 943; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 944; GFX12-CU-NEXT: 
s_endpgm 945 i32 %in, ptr %out) { 946entry: 947 store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 948 ret void 949} 950 951define amdgpu_kernel void @flat_singlethread_release_store( 952; GFX7-LABEL: flat_singlethread_release_store: 953; GFX7: ; %bb.0: ; %entry 954; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 955; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 956; GFX7-NEXT: s_waitcnt lgkmcnt(0) 957; GFX7-NEXT: v_mov_b32_e32 v0, s6 958; GFX7-NEXT: v_mov_b32_e32 v1, s7 959; GFX7-NEXT: v_mov_b32_e32 v2, s4 960; GFX7-NEXT: flat_store_dword v[0:1], v2 961; GFX7-NEXT: s_endpgm 962; 963; GFX10-WGP-LABEL: flat_singlethread_release_store: 964; GFX10-WGP: ; %bb.0: ; %entry 965; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 966; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 967; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 968; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 969; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 970; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 971; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 972; GFX10-WGP-NEXT: s_endpgm 973; 974; GFX10-CU-LABEL: flat_singlethread_release_store: 975; GFX10-CU: ; %bb.0: ; %entry 976; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 977; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 978; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 979; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 980; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 981; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 982; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 983; GFX10-CU-NEXT: s_endpgm 984; 985; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store: 986; SKIP-CACHE-INV: ; %bb.0: ; %entry 987; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 988; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 989; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 990; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 991; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 992; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 993; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 994; SKIP-CACHE-INV-NEXT: s_endpgm 995; 996; 
GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: 997; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 998; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 999; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1000; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1001; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1002; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1003; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1004; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1005; 1006; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: 1007; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1008; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1009; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1010; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1012; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1013; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1014; GFX90A-TGSPLIT-NEXT: s_endpgm 1015; 1016; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store: 1017; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1018; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1019; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1020; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1021; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1022; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1023; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1024; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1025; 1026; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store: 1027; GFX940-TGSPLIT: ; %bb.0: ; %entry 1028; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1029; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1030; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1031; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1032; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1033; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1034; GFX940-TGSPLIT-NEXT: s_endpgm 1035; 1036; GFX11-WGP-LABEL: 
flat_singlethread_release_store: 1037; GFX11-WGP: ; %bb.0: ; %entry 1038; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1039; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1040; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1041; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1042; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1043; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1044; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1045; GFX11-WGP-NEXT: s_endpgm 1046; 1047; GFX11-CU-LABEL: flat_singlethread_release_store: 1048; GFX11-CU: ; %bb.0: ; %entry 1049; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1050; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1051; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1053; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1054; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1055; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1056; GFX11-CU-NEXT: s_endpgm 1057; 1058; GFX12-WGP-LABEL: flat_singlethread_release_store: 1059; GFX12-WGP: ; %bb.0: ; %entry 1060; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1061; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1062; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1063; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1064; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1065; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1066; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 1067; GFX12-WGP-NEXT: s_endpgm 1068; 1069; GFX12-CU-LABEL: flat_singlethread_release_store: 1070; GFX12-CU: ; %bb.0: ; %entry 1071; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1072; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1073; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1074; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1075; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1076; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1077; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 1078; GFX12-CU-NEXT: s_endpgm 1079 i32 %in, ptr %out) { 1080entry: 1081 store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 1082 ret void 1083} 1084 1085define amdgpu_kernel void @flat_singlethread_seq_cst_store( 1086; GFX7-LABEL: flat_singlethread_seq_cst_store: 
1087; GFX7: ; %bb.0: ; %entry 1088; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 1089; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 1090; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1091; GFX7-NEXT: v_mov_b32_e32 v0, s6 1092; GFX7-NEXT: v_mov_b32_e32 v1, s7 1093; GFX7-NEXT: v_mov_b32_e32 v2, s4 1094; GFX7-NEXT: flat_store_dword v[0:1], v2 1095; GFX7-NEXT: s_endpgm 1096; 1097; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: 1098; GFX10-WGP: ; %bb.0: ; %entry 1099; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 1100; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1101; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1102; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1103; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1104; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1105; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1106; GFX10-WGP-NEXT: s_endpgm 1107; 1108; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: 1109; GFX10-CU: ; %bb.0: ; %entry 1110; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 1111; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1112; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1113; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1114; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1115; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1116; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1117; GFX10-CU-NEXT: s_endpgm 1118; 1119; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store: 1120; SKIP-CACHE-INV: ; %bb.0: ; %entry 1121; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 1122; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1123; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1124; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1125; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1126; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1127; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1128; SKIP-CACHE-INV-NEXT: s_endpgm 1129; 1130; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: 1131; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1132; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1133; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 
0x8 1134; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1135; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1136; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1137; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1138; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1139; 1140; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: 1141; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1142; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 1143; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 1144; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1145; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1146; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1147; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1148; GFX90A-TGSPLIT-NEXT: s_endpgm 1149; 1150; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: 1151; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1152; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1153; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1154; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1156; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1157; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1158; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1159; 1160; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: 1161; GFX940-TGSPLIT: ; %bb.0: ; %entry 1162; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 1163; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1164; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1166; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1167; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1168; GFX940-TGSPLIT-NEXT: s_endpgm 1169; 1170; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store: 1171; GFX11-WGP: ; %bb.0: ; %entry 1172; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1173; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1174; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1175; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1176; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1177; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1178; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 1179; GFX11-WGP-NEXT: s_endpgm 1180; 1181; GFX11-CU-LABEL: flat_singlethread_seq_cst_store: 1182; GFX11-CU: ; %bb.0: ; %entry 1183; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1184; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1185; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1186; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1187; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1188; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1189; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 1190; GFX11-CU-NEXT: s_endpgm 1191; 1192; GFX12-WGP-LABEL: flat_singlethread_seq_cst_store: 1193; GFX12-WGP: ; %bb.0: ; %entry 1194; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 1195; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1196; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1197; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1198; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1199; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1200; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 1201; GFX12-WGP-NEXT: s_endpgm 1202; 1203; GFX12-CU-LABEL: flat_singlethread_seq_cst_store: 1204; GFX12-CU: ; %bb.0: ; %entry 1205; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 1206; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 1207; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1208; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1209; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1210; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1211; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 1212; GFX12-CU-NEXT: s_endpgm 1213 i32 %in, ptr %out) { 1214entry: 1215 store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 1216 ret void 1217} 1218 1219define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( 1220; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: 1221; GFX7: ; %bb.0: ; %entry 1222; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1223; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1224; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1225; GFX7-NEXT: v_mov_b32_e32 v0, s6 
1226; GFX7-NEXT: v_mov_b32_e32 v1, s7 1227; GFX7-NEXT: v_mov_b32_e32 v2, s4 1228; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1229; GFX7-NEXT: s_endpgm 1230; 1231; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: 1232; GFX10-WGP: ; %bb.0: ; %entry 1233; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1234; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1235; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1237; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1238; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1239; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1240; GFX10-WGP-NEXT: s_endpgm 1241; 1242; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: 1243; GFX10-CU: ; %bb.0: ; %entry 1244; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1245; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1246; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1247; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1248; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1249; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1250; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1251; GFX10-CU-NEXT: s_endpgm 1252; 1253; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw: 1254; SKIP-CACHE-INV: ; %bb.0: ; %entry 1255; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1256; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1257; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1258; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1259; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1260; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1261; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1262; SKIP-CACHE-INV-NEXT: s_endpgm 1263; 1264; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1265; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1266; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1267; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1268; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1269; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1270; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s4 1271; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1272; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1273; 1274; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1275; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1276; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1277; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1278; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1280; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1281; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1282; GFX90A-TGSPLIT-NEXT: s_endpgm 1283; 1284; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1285; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1286; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1287; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1288; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1290; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1291; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1292; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1293; 1294; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: 1295; GFX940-TGSPLIT: ; %bb.0: ; %entry 1296; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1297; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1298; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1299; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1300; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1301; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1302; GFX940-TGSPLIT-NEXT: s_endpgm 1303; 1304; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: 1305; GFX11-WGP: ; %bb.0: ; %entry 1306; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1307; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1308; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1310; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1311; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1312; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 
1313; GFX11-WGP-NEXT: s_endpgm 1314; 1315; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw: 1316; GFX11-CU: ; %bb.0: ; %entry 1317; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1318; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1319; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1320; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1321; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1322; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1323; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1324; GFX11-CU-NEXT: s_endpgm 1325; 1326; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: 1327; GFX12-WGP: ; %bb.0: ; %entry 1328; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1329; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1330; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1331; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1332; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1333; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1334; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1335; GFX12-WGP-NEXT: s_endpgm 1336; 1337; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw: 1338; GFX12-CU: ; %bb.0: ; %entry 1339; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1340; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1341; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1342; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1343; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1344; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1345; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1346; GFX12-CU-NEXT: s_endpgm 1347 ptr %out, i32 %in) { 1348entry: 1349 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic 1350 ret void 1351} 1352 1353define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( 1354; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: 1355; GFX7: ; %bb.0: ; %entry 1356; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1357; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1358; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1359; GFX7-NEXT: v_mov_b32_e32 v0, s6 1360; GFX7-NEXT: v_mov_b32_e32 v1, s7 1361; GFX7-NEXT: v_mov_b32_e32 v2, s4 1362; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1363; 
GFX7-NEXT: s_endpgm 1364; 1365; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: 1366; GFX10-WGP: ; %bb.0: ; %entry 1367; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1368; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1369; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1371; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1372; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1373; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1374; GFX10-WGP-NEXT: s_endpgm 1375; 1376; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: 1377; GFX10-CU: ; %bb.0: ; %entry 1378; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1379; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1380; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1382; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1383; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1384; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1385; GFX10-CU-NEXT: s_endpgm 1386; 1387; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw: 1388; SKIP-CACHE-INV: ; %bb.0: ; %entry 1389; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1390; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1391; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1392; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1393; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1394; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1395; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1396; SKIP-CACHE-INV-NEXT: s_endpgm 1397; 1398; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1399; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1400; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1401; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1402; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1404; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1405; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1406; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1407; 1408; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_acquire_atomicrmw: 1409; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1410; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1411; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1412; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1413; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1414; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1415; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1416; GFX90A-TGSPLIT-NEXT: s_endpgm 1417; 1418; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1419; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1420; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1421; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1422; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1423; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1424; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1425; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1426; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1427; 1428; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: 1429; GFX940-TGSPLIT: ; %bb.0: ; %entry 1430; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1431; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1432; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1434; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1435; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1436; GFX940-TGSPLIT-NEXT: s_endpgm 1437; 1438; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw: 1439; GFX11-WGP: ; %bb.0: ; %entry 1440; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1441; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1442; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1444; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1445; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1446; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1447; GFX11-WGP-NEXT: s_endpgm 1448; 1449; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw: 1450; GFX11-CU: ; %bb.0: ; %entry 1451; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1452; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1453; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1454; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1455; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1456; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1457; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1458; GFX11-CU-NEXT: s_endpgm 1459; 1460; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw: 1461; GFX12-WGP: ; %bb.0: ; %entry 1462; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1463; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1464; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1465; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1466; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1467; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1468; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1469; GFX12-WGP-NEXT: s_endpgm 1470; 1471; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw: 1472; GFX12-CU: ; %bb.0: ; %entry 1473; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1474; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1475; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1476; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1477; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1478; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1479; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1480; GFX12-CU-NEXT: s_endpgm 1481 ptr %out, i32 %in) { 1482entry: 1483 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire 1484 ret void 1485} 1486 1487define amdgpu_kernel void @flat_singlethread_release_atomicrmw( 1488; GFX7-LABEL: flat_singlethread_release_atomicrmw: 1489; GFX7: ; %bb.0: ; %entry 1490; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1491; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1492; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1493; GFX7-NEXT: v_mov_b32_e32 v0, s6 1494; GFX7-NEXT: v_mov_b32_e32 v1, s7 1495; GFX7-NEXT: v_mov_b32_e32 v2, s4 1496; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1497; GFX7-NEXT: s_endpgm 1498; 1499; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: 1500; GFX10-WGP: ; %bb.0: ; %entry 1501; GFX10-WGP-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 1502; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1503; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1504; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1505; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1506; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1507; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1508; GFX10-WGP-NEXT: s_endpgm 1509; 1510; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: 1511; GFX10-CU: ; %bb.0: ; %entry 1512; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1513; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1514; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1516; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1517; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1518; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1519; GFX10-CU-NEXT: s_endpgm 1520; 1521; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw: 1522; SKIP-CACHE-INV: ; %bb.0: ; %entry 1523; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1524; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1525; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1526; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1527; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1528; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1529; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1530; SKIP-CACHE-INV-NEXT: s_endpgm 1531; 1532; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1533; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1534; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1535; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1536; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1537; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1538; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1539; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1540; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1541; 1542; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1543; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1544; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 
1545; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1546; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1547; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1548; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1549; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1550; GFX90A-TGSPLIT-NEXT: s_endpgm 1551; 1552; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1553; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1554; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1555; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1556; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1557; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1558; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1559; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1560; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1561; 1562; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: 1563; GFX940-TGSPLIT: ; %bb.0: ; %entry 1564; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1565; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1566; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1567; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1568; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1569; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1570; GFX940-TGSPLIT-NEXT: s_endpgm 1571; 1572; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw: 1573; GFX11-WGP: ; %bb.0: ; %entry 1574; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1575; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1576; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1577; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1578; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1579; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1580; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1581; GFX11-WGP-NEXT: s_endpgm 1582; 1583; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw: 1584; GFX11-CU: ; %bb.0: ; %entry 1585; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1586; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1587; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1588; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1589; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1590; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1591; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1592; GFX11-CU-NEXT: s_endpgm 1593; 1594; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw: 1595; GFX12-WGP: ; %bb.0: ; %entry 1596; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1597; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1598; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1599; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1600; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1601; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1602; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1603; GFX12-WGP-NEXT: s_endpgm 1604; 1605; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw: 1606; GFX12-CU: ; %bb.0: ; %entry 1607; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1608; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1609; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1610; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1611; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1612; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1613; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1614; GFX12-CU-NEXT: s_endpgm 1615 ptr %out, i32 %in) { 1616entry: 1617 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release 1618 ret void 1619} 1620 1621define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( 1622; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: 1623; GFX7: ; %bb.0: ; %entry 1624; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1625; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1626; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1627; GFX7-NEXT: v_mov_b32_e32 v0, s6 1628; GFX7-NEXT: v_mov_b32_e32 v1, s7 1629; GFX7-NEXT: v_mov_b32_e32 v2, s4 1630; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1631; GFX7-NEXT: s_endpgm 1632; 1633; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: 1634; GFX10-WGP: ; %bb.0: ; %entry 1635; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1636; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1637; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1638; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1639; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1640; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1641; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1642; GFX10-WGP-NEXT: s_endpgm 1643; 1644; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: 1645; GFX10-CU: ; %bb.0: ; %entry 1646; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1647; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1648; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1649; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1650; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1651; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1652; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1653; GFX10-CU-NEXT: s_endpgm 1654; 1655; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw: 1656; SKIP-CACHE-INV: ; %bb.0: ; %entry 1657; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1658; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1659; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1660; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1661; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1662; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1663; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1664; SKIP-CACHE-INV-NEXT: s_endpgm 1665; 1666; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1667; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1668; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1669; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1670; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1671; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1672; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1673; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1674; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1675; 1676; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1677; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1678; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1679; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1680; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1681; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1682; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1683; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1684; GFX90A-TGSPLIT-NEXT: s_endpgm 1685; 1686; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1687; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1688; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1689; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1690; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1691; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1692; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1693; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1694; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1695; 1696; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: 1697; GFX940-TGSPLIT: ; %bb.0: ; %entry 1698; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1699; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1700; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1701; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1702; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1703; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1704; GFX940-TGSPLIT-NEXT: s_endpgm 1705; 1706; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: 1707; GFX11-WGP: ; %bb.0: ; %entry 1708; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1709; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1710; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1711; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1712; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1713; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1714; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1715; GFX11-WGP-NEXT: s_endpgm 1716; 1717; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: 1718; GFX11-CU: ; %bb.0: ; %entry 1719; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1720; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1721; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1722; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1723; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1724; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1725; 
GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1726; GFX11-CU-NEXT: s_endpgm 1727; 1728; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: 1729; GFX12-WGP: ; %bb.0: ; %entry 1730; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1731; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1732; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1733; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1734; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1735; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1736; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1737; GFX12-WGP-NEXT: s_endpgm 1738; 1739; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: 1740; GFX12-CU: ; %bb.0: ; %entry 1741; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1742; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1743; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1744; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1745; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1746; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1747; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1748; GFX12-CU-NEXT: s_endpgm 1749 ptr %out, i32 %in) { 1750entry: 1751 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel 1752 ret void 1753} 1754 1755define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( 1756; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: 1757; GFX7: ; %bb.0: ; %entry 1758; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1759; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 1760; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1761; GFX7-NEXT: v_mov_b32_e32 v0, s6 1762; GFX7-NEXT: v_mov_b32_e32 v1, s7 1763; GFX7-NEXT: v_mov_b32_e32 v2, s4 1764; GFX7-NEXT: flat_atomic_swap v[0:1], v2 1765; GFX7-NEXT: s_endpgm 1766; 1767; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: 1768; GFX10-WGP: ; %bb.0: ; %entry 1769; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1770; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 1771; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1772; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 1773; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 1774; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 1775; 
GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 1776; GFX10-WGP-NEXT: s_endpgm 1777; 1778; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: 1779; GFX10-CU: ; %bb.0: ; %entry 1780; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1781; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 1782; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1783; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 1784; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 1785; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 1786; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 1787; GFX10-CU-NEXT: s_endpgm 1788; 1789; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw: 1790; SKIP-CACHE-INV: ; %bb.0: ; %entry 1791; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1792; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 1793; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1794; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1795; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1796; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1797; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 1798; SKIP-CACHE-INV-NEXT: s_endpgm 1799; 1800; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1801; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1802; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1803; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1804; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1805; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1806; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1807; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1808; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1809; 1810; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1811; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1812; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 1813; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 1814; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1815; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 1816; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 1817; GFX90A-TGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 1818; GFX90A-TGSPLIT-NEXT: s_endpgm 1819; 1820; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1821; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1822; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1823; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1824; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1825; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1826; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1827; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1828; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1829; 1830; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: 1831; GFX940-TGSPLIT: ; %bb.0: ; %entry 1832; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1833; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 1834; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1835; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 1836; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 1837; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 1838; GFX940-TGSPLIT-NEXT: s_endpgm 1839; 1840; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: 1841; GFX11-WGP: ; %bb.0: ; %entry 1842; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1843; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1844; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1845; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 1846; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 1847; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 1848; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1849; GFX11-WGP-NEXT: s_endpgm 1850; 1851; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: 1852; GFX11-CU: ; %bb.0: ; %entry 1853; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1854; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1855; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1856; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 1857; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 1858; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 1859; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1860; GFX11-CU-NEXT: s_endpgm 1861; 1862; GFX12-WGP-LABEL: 
flat_singlethread_seq_cst_atomicrmw: 1863; GFX12-WGP: ; %bb.0: ; %entry 1864; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1865; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 1866; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1867; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 1868; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 1869; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 1870; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 1871; GFX12-WGP-NEXT: s_endpgm 1872; 1873; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: 1874; GFX12-CU: ; %bb.0: ; %entry 1875; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 1876; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 1877; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1878; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 1879; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 1880; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 1881; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 1882; GFX12-CU-NEXT: s_endpgm 1883 ptr %out, i32 %in) { 1884entry: 1885 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst 1886 ret void 1887} 1888 1889define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( 1890; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1891; GFX7: ; %bb.0: ; %entry 1892; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1893; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 1894; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1895; GFX7-NEXT: v_mov_b32_e32 v0, s4 1896; GFX7-NEXT: v_mov_b32_e32 v1, s5 1897; GFX7-NEXT: v_mov_b32_e32 v2, s6 1898; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1899; GFX7-NEXT: v_mov_b32_e32 v0, s4 1900; GFX7-NEXT: v_mov_b32_e32 v1, s5 1901; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1902; GFX7-NEXT: flat_store_dword v[0:1], v2 1903; GFX7-NEXT: s_endpgm 1904; 1905; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1906; GFX10-WGP: ; %bb.0: ; %entry 1907; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1908; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 1909; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1910; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 1911; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 1912; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 1913; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1914; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 1915; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 1916; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1917; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1918; GFX10-WGP-NEXT: s_endpgm 1919; 1920; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1921; GFX10-CU: ; %bb.0: ; %entry 1922; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1923; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 1924; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 1926; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 1927; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 1928; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1929; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 1930; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 1931; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1932; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1933; GFX10-CU-NEXT: s_endpgm 1934; 1935; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1936; SKIP-CACHE-INV: ; %bb.0: ; %entry 1937; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1938; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 1939; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1940; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1941; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 1943; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1944; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1945; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1946; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1947; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1948; SKIP-CACHE-INV-NEXT: s_endpgm 1949; 1950; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1951; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 1952; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1953; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 1954; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1955; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 1956; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 1957; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1958; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 1959; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1960; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 1961; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 1962; 1963; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1964; GFX90A-TGSPLIT: ; %bb.0: ; %entry 1965; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1966; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 1967; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1968; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 1969; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 1970; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1971; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 1972; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1973; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 1974; GFX90A-TGSPLIT-NEXT: s_endpgm 1975; 1976; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1977; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 1978; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1979; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1980; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1981; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1982; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1983; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1984; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1985; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1986; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 1987; GFX940-NOTTGSPLIT-NEXT: s_endpgm 1988; 1989; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: 1990; GFX940-TGSPLIT: ; %bb.0: ; %entry 1991; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 1992; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 1993; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 1994; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1995; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 1996; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 1997; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1998; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 1999; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2000; GFX940-TGSPLIT-NEXT: s_endpgm 2001; 2002; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: 2003; GFX11-WGP: ; %bb.0: ; %entry 2004; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2005; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2006; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2007; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2008; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2009; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 2010; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2011; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2012; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2013; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2014; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 2015; GFX11-WGP-NEXT: s_endpgm 2016; 2017; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: 2018; GFX11-CU: ; %bb.0: ; %entry 2019; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2020; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2021; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2022; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2023; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2024; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 2025; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2026; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2027; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2028; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2029; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 2030; GFX11-CU-NEXT: s_endpgm 2031; 2032; GFX12-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: 2033; GFX12-WGP: ; %bb.0: ; %entry 2034; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2035; GFX12-WGP-NEXT: s_load_b32 
s2, s[4:5], 0x8 2036; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2037; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2038; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2039; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 2040; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 2041; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2042; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2043; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 2044; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 2045; GFX12-WGP-NEXT: s_endpgm 2046; 2047; GFX12-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: 2048; GFX12-CU: ; %bb.0: ; %entry 2049; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2050; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2051; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2052; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2053; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2054; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 2055; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 2056; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2057; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2058; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 2059; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 2060; GFX12-CU-NEXT: s_endpgm 2061 ptr %out, i32 %in) { 2062entry: 2063 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire 2064 store i32 %val, ptr %out, align 4 2065 ret void 2066} 2067 2068define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( 2069; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2070; GFX7: ; %bb.0: ; %entry 2071; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2072; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 2073; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2074; GFX7-NEXT: v_mov_b32_e32 v0, s4 2075; GFX7-NEXT: v_mov_b32_e32 v1, s5 2076; GFX7-NEXT: v_mov_b32_e32 v2, s6 2077; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2078; GFX7-NEXT: v_mov_b32_e32 v0, s4 2079; GFX7-NEXT: v_mov_b32_e32 v1, s5 2080; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2081; GFX7-NEXT: flat_store_dword v[0:1], v2 2082; GFX7-NEXT: s_endpgm 2083; 2084; 
GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2085; GFX10-WGP: ; %bb.0: ; %entry 2086; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2087; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 2088; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2089; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2090; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2091; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 2092; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2093; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2094; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2095; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2096; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2097; GFX10-WGP-NEXT: s_endpgm 2098; 2099; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2100; GFX10-CU: ; %bb.0: ; %entry 2101; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2102; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 2103; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2104; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2105; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2106; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 2107; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2108; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2109; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2110; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2111; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2112; GFX10-CU-NEXT: s_endpgm 2113; 2114; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2115; SKIP-CACHE-INV: ; %bb.0: ; %entry 2116; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2117; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 2118; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2119; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2120; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2121; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2122; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2123; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2124; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2125; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2126; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2127; 
SKIP-CACHE-INV-NEXT: s_endpgm 2128; 2129; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2130; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2131; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2132; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2133; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2134; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2135; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2136; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2137; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2138; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2139; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 2140; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2141; 2142; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2143; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2144; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2145; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 2146; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2147; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2148; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 2149; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 2150; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2151; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2152; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 2153; GFX90A-TGSPLIT-NEXT: s_endpgm 2154; 2155; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2156; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 2157; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2158; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2159; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2161; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2162; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2163; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2164; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 2165; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2166; GFX940-NOTTGSPLIT-NEXT: s_endpgm 2167; 2168; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2169; GFX940-TGSPLIT: ; %bb.0: ; %entry 2170; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2171; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 2172; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2173; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2174; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 2175; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 2176; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2177; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 2178; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 2179; GFX940-TGSPLIT-NEXT: s_endpgm 2180; 2181; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2182; GFX11-WGP: ; %bb.0: ; %entry 2183; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2184; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 2185; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 2186; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2187; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2188; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 2189; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2190; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 2191; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 2192; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2193; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 2194; GFX11-WGP-NEXT: s_endpgm 2195; 2196; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: 2197; GFX11-CU: ; %bb.0: ; %entry 2198; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2199; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 2200; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 2201; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2202; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2203; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 2204; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 2205; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 2206; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 2207; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
  store i32 %val, ptr %out, align 4
  ret void
}

; Returning `atomicrmw volatile xchg` with seq_cst ordering at
; syncscope("singlethread"). The IR body, which follows the check lines,
; swaps %in into the i32 at %out and stores the returned value back to %out.
; Every check block for this kernel expects the same shape: two kernarg
; loads and a wait, one flat atomic swap, the address moved into v[0:1]
; again, a wait, and the flat store of v2.
; The wait before the store differs by run: the two "+tgsplit" runs expect
; only vmcnt(0), and the two gfx1200 runs expect s_wait_loadcnt_dscnt 0x0.
; The check lines are autogenerated (see the NOTE on the first line of the
; file), so they are normally regenerated with the update script rather than
; edited by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32
v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in) {
entry:
  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
  store i32 %val, ptr %out, align 4
  ret void
}

; `cmpxchg volatile` with monotonic success and monotonic failure orderings
; at syncscope("singlethread"). The IR body, which follows the check lines,
; applies it to %out advanced by four i32 elements and does not use the
; result.
; Every check block for this kernel expects one flat atomic compare-and-swap
; followed directly by s_endpgm.
; The runs differ in how the 16-byte displacement is expected to appear:
;   * the GFX7, GFX10-WGP, GFX10-CU and SKIP-CACHE-INV checks add 16 to the
;     base address with s_add_u32 / s_addc_u32;
;   * the GFX90A, GFX940, GFX11 and GFX12 checks carry it as "offset:16" on
;     the atomic itself.
; The check lines are autogenerated (see the NOTE on the first line of the
; file), so they are normally regenerated with the update script rather than
; edited by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2,
s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
  ret void
}

; `cmpxchg volatile` with acquire success and monotonic failure orderings at
; syncscope("singlethread"). The IR body, which follows the check lines,
; applies it to %out advanced by four i32 elements and does not use the
; result.
; Every check block for this kernel expects one flat atomic compare-and-swap
; followed directly by s_endpgm; no wait or other instruction is expected
; after the atomic.
; The runs differ in how the 16-byte displacement is expected to appear:
;   * the GFX7, GFX10-WGP, GFX10-CU and SKIP-CACHE-INV checks add 16 to the
;     base address with s_add_u32 / s_addc_u32;
;   * the GFX90A, GFX940, GFX11 and GFX12 checks carry it as "offset:16" on
;     the atomic itself.
; The check lines are autogenerated (see the NOTE on the first line of the
; file), so they are normally regenerated with the update script rather than
; edited by hand.
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
2840; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 2841; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 2842; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 2843; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 2844; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2845; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 2846; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 2847; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 2848; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2849; GFX12-WGP-NEXT: s_endpgm 2850; 2851; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: 2852; GFX12-CU: ; %bb.0: ; %entry 2853; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2854; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 2855; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 2856; GFX12-CU-NEXT: s_wait_kmcnt 0x0 2857; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 2858; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 2859; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2860; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 2861; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 2862; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 2863; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 2864; GFX12-CU-NEXT: s_endpgm 2865 ptr %out, i32 %in, i32 %old) { 2866entry: 2867 %gep = getelementptr i32, ptr %out, i32 4 2868 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic 2869 ret void 2870} 2871 2872define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( 2873; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2874; GFX7: ; %bb.0: ; %entry 2875; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2876; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2877; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 2878; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 2879; GFX7-NEXT: s_mov_b64 s[10:11], 16 2880; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2881; GFX7-NEXT: s_mov_b32 s4, s8 2882; GFX7-NEXT: s_mov_b32 s5, s9 2883; GFX7-NEXT: s_mov_b32 s9, s10 2884; GFX7-NEXT: s_mov_b32 s8, s11 2885; GFX7-NEXT: s_add_u32 s4, 
s4, s9 2886; GFX7-NEXT: s_addc_u32 s8, s5, s8 2887; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2888; GFX7-NEXT: s_mov_b32 s5, s8 2889; GFX7-NEXT: v_mov_b32_e32 v2, s7 2890; GFX7-NEXT: v_mov_b32_e32 v0, s6 2891; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2892; GFX7-NEXT: v_mov_b32_e32 v3, v0 2893; GFX7-NEXT: v_mov_b32_e32 v0, s4 2894; GFX7-NEXT: v_mov_b32_e32 v1, s5 2895; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2896; GFX7-NEXT: s_endpgm 2897; 2898; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2899; GFX10-WGP: ; %bb.0: ; %entry 2900; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 2901; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2902; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 2903; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 2904; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 2905; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2906; GFX10-WGP-NEXT: s_mov_b32 s4, s8 2907; GFX10-WGP-NEXT: s_mov_b32 s5, s9 2908; GFX10-WGP-NEXT: s_mov_b32 s9, s10 2909; GFX10-WGP-NEXT: s_mov_b32 s8, s11 2910; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 2911; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 2912; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2913; GFX10-WGP-NEXT: s_mov_b32 s5, s8 2914; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 2915; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 2916; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2917; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 2918; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2919; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2920; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2921; GFX10-WGP-NEXT: s_endpgm 2922; 2923; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2924; GFX10-CU: ; %bb.0: ; %entry 2925; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 2926; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 2927; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 2928; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 2929; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 
2930; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2931; GFX10-CU-NEXT: s_mov_b32 s4, s8 2932; GFX10-CU-NEXT: s_mov_b32 s5, s9 2933; GFX10-CU-NEXT: s_mov_b32 s9, s10 2934; GFX10-CU-NEXT: s_mov_b32 s8, s11 2935; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 2936; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 2937; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 2938; GFX10-CU-NEXT: s_mov_b32 s5, s8 2939; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 2940; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 2941; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2942; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 2943; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2944; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2945; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 2946; GFX10-CU-NEXT: s_endpgm 2947; 2948; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2949; SKIP-CACHE-INV: ; %bb.0: ; %entry 2950; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 2951; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 2952; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 2953; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 2954; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 2955; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2956; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2957; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2958; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 2959; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 2960; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 2961; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 2962; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 2963; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 2964; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 2965; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2966; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2967; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 2968; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2969; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2970; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
2971; SKIP-CACHE-INV-NEXT: s_endpgm 2972; 2973; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2974; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 2975; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2976; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 2977; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 2978; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2979; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 2980; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 2981; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2982; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 2983; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2984; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2985; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 2986; 2987; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 2988; GFX90A-TGSPLIT: ; %bb.0: ; %entry 2989; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2990; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 2991; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 2992; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 2993; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 2994; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 2995; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 2996; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 2997; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 2998; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 2999; GFX90A-TGSPLIT-NEXT: s_endpgm 3000; 3001; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 3002; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3003; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3004; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3005; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3006; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3007; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s3 3008; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3009; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3010; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3011; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3012; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3013; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3014; 3015; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: 3016; GFX940-TGSPLIT: ; %bb.0: ; %entry 3017; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3018; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3019; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3020; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3021; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3022; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3023; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3024; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3025; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3026; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3027; GFX940-TGSPLIT-NEXT: s_endpgm 3028; 3029; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: 3030; GFX11-WGP: ; %bb.0: ; %entry 3031; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3032; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3033; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3034; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3035; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3036; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3037; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3038; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3039; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3040; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3041; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3042; GFX11-WGP-NEXT: s_endpgm 3043; 3044; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: 3045; GFX11-CU: ; %bb.0: ; %entry 3046; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3047; GFX11-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 3048; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3049; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3050; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3051; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3052; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3053; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3054; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3055; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3056; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3057; GFX11-CU-NEXT: s_endpgm 3058; 3059; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: 3060; GFX12-WGP: ; %bb.0: ; %entry 3061; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3062; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3063; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3064; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3065; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3066; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3067; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3068; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3069; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3070; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3071; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3072; GFX12-WGP-NEXT: s_endpgm 3073; 3074; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: 3075; GFX12-CU: ; %bb.0: ; %entry 3076; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3077; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3078; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3079; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3080; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3081; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3082; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3083; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3084; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3085; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3086; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3087; GFX12-CU-NEXT: s_endpgm 3088 ptr %out, i32 %in, i32 %old) { 3089entry: 3090 %gep = getelementptr i32, ptr %out, i32 4 3091 
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic 3092 ret void 3093} 3094 3095define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( 3096; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3097; GFX7: ; %bb.0: ; %entry 3098; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3099; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3100; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3101; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3102; GFX7-NEXT: s_mov_b64 s[10:11], 16 3103; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3104; GFX7-NEXT: s_mov_b32 s4, s8 3105; GFX7-NEXT: s_mov_b32 s5, s9 3106; GFX7-NEXT: s_mov_b32 s9, s10 3107; GFX7-NEXT: s_mov_b32 s8, s11 3108; GFX7-NEXT: s_add_u32 s4, s4, s9 3109; GFX7-NEXT: s_addc_u32 s8, s5, s8 3110; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3111; GFX7-NEXT: s_mov_b32 s5, s8 3112; GFX7-NEXT: v_mov_b32_e32 v2, s7 3113; GFX7-NEXT: v_mov_b32_e32 v0, s6 3114; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3115; GFX7-NEXT: v_mov_b32_e32 v3, v0 3116; GFX7-NEXT: v_mov_b32_e32 v0, s4 3117; GFX7-NEXT: v_mov_b32_e32 v1, s5 3118; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3119; GFX7-NEXT: s_endpgm 3120; 3121; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3122; GFX10-WGP: ; %bb.0: ; %entry 3123; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3124; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3125; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3126; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3127; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3128; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3129; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3130; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3131; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3132; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3133; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3134; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3135; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3136; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3137; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s7 3138; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3139; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3140; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3141; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3142; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3143; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3144; GFX10-WGP-NEXT: s_endpgm 3145; 3146; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3147; GFX10-CU: ; %bb.0: ; %entry 3148; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3149; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3150; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3151; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 3152; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 3153; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3154; GFX10-CU-NEXT: s_mov_b32 s4, s8 3155; GFX10-CU-NEXT: s_mov_b32 s5, s9 3156; GFX10-CU-NEXT: s_mov_b32 s9, s10 3157; GFX10-CU-NEXT: s_mov_b32 s8, s11 3158; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3159; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3160; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3161; GFX10-CU-NEXT: s_mov_b32 s5, s8 3162; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3163; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3164; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3165; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3166; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3167; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3168; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3169; GFX10-CU-NEXT: s_endpgm 3170; 3171; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3172; SKIP-CACHE-INV: ; %bb.0: ; %entry 3173; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3174; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3175; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3176; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3177; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3178; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3179; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3180; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3181; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3182; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3183; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3184; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 3185; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3186; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3187; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3188; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3189; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3190; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3191; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3192; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3193; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3194; SKIP-CACHE-INV-NEXT: s_endpgm 3195; 3196; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3197; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3198; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3199; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3200; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3201; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3202; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3203; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3204; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3205; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3206; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3207; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3208; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3209; 3210; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3211; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3212; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3213; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3214; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3215; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3216; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3217; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v0, s6 3218; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3219; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3220; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3221; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3222; GFX90A-TGSPLIT-NEXT: s_endpgm 3223; 3224; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3225; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3226; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3227; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3228; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3229; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3230; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3231; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3232; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3233; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3234; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3235; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3236; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3237; 3238; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3239; GFX940-TGSPLIT: ; %bb.0: ; %entry 3240; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3241; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3242; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3243; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3244; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3245; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3246; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3247; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3248; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3249; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3250; GFX940-TGSPLIT-NEXT: s_endpgm 3251; 3252; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3253; GFX11-WGP: ; %bb.0: ; %entry 3254; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 3255; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3256; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3257; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3258; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3259; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3260; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3261; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3262; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3263; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3264; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3265; GFX11-WGP-NEXT: s_endpgm 3266; 3267; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3268; GFX11-CU: ; %bb.0: ; %entry 3269; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3270; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3271; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3272; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3273; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3274; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3275; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3276; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3277; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3278; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3279; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3280; GFX11-CU-NEXT: s_endpgm 3281; 3282; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 3283; GFX12-WGP: ; %bb.0: ; %entry 3284; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3285; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3286; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3287; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3288; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3289; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3290; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3291; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3292; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3293; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3294; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3295; GFX12-WGP-NEXT: s_endpgm 3296; 3297; GFX12-CU-LABEL: 
flat_singlethread_acq_rel_monotonic_cmpxchg: 3298; GFX12-CU: ; %bb.0: ; %entry 3299; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3300; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3301; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3302; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3303; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3304; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3305; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3306; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3307; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3308; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3309; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3310; GFX12-CU-NEXT: s_endpgm 3311 ptr %out, i32 %in, i32 %old) { 3312entry: 3313 %gep = getelementptr i32, ptr %out, i32 4 3314 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic 3315 ret void 3316} 3317 3318define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( 3319; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3320; GFX7: ; %bb.0: ; %entry 3321; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3322; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3323; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3324; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3325; GFX7-NEXT: s_mov_b64 s[10:11], 16 3326; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3327; GFX7-NEXT: s_mov_b32 s4, s8 3328; GFX7-NEXT: s_mov_b32 s5, s9 3329; GFX7-NEXT: s_mov_b32 s9, s10 3330; GFX7-NEXT: s_mov_b32 s8, s11 3331; GFX7-NEXT: s_add_u32 s4, s4, s9 3332; GFX7-NEXT: s_addc_u32 s8, s5, s8 3333; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3334; GFX7-NEXT: s_mov_b32 s5, s8 3335; GFX7-NEXT: v_mov_b32_e32 v2, s7 3336; GFX7-NEXT: v_mov_b32_e32 v0, s6 3337; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3338; GFX7-NEXT: v_mov_b32_e32 v3, v0 3339; GFX7-NEXT: v_mov_b32_e32 v0, s4 3340; GFX7-NEXT: v_mov_b32_e32 v1, s5 3341; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3342; GFX7-NEXT: s_endpgm 3343; 3344; 
GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3345; GFX10-WGP: ; %bb.0: ; %entry 3346; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3347; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3348; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3349; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3350; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3351; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3352; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3353; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3354; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3355; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3356; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3357; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3358; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3359; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3360; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 3361; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3362; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3363; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3364; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3365; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3366; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3367; GFX10-WGP-NEXT: s_endpgm 3368; 3369; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3370; GFX10-CU: ; %bb.0: ; %entry 3371; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3372; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3373; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3374; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 3375; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 3376; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3377; GFX10-CU-NEXT: s_mov_b32 s4, s8 3378; GFX10-CU-NEXT: s_mov_b32 s5, s9 3379; GFX10-CU-NEXT: s_mov_b32 s9, s10 3380; GFX10-CU-NEXT: s_mov_b32 s8, s11 3381; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3382; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3383; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3384; GFX10-CU-NEXT: s_mov_b32 s5, s8 3385; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3386; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3387; GFX10-CU-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3388; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3389; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3390; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3391; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3392; GFX10-CU-NEXT: s_endpgm 3393; 3394; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3395; SKIP-CACHE-INV: ; %bb.0: ; %entry 3396; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3397; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3398; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3399; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3400; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3401; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3402; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3403; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3404; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3405; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3406; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3407; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 3408; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3409; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3410; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3411; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3412; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3413; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3414; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3415; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3416; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3417; SKIP-CACHE-INV-NEXT: s_endpgm 3418; 3419; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3420; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3421; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3422; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3423; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3424; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3425; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3426; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3427; 
GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3428; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3429; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3430; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3431; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3432; 3433; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3434; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3435; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3436; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3437; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3438; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3439; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3440; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3441; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3442; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3443; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3444; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3445; GFX90A-TGSPLIT-NEXT: s_endpgm 3446; 3447; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3448; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3449; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3450; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3451; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3452; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3453; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3454; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3455; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3456; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3457; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3458; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3459; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3460; 3461; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3462; GFX940-TGSPLIT: ; %bb.0: ; %entry 3463; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3464; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3465; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3466; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3467; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3468; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3469; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3470; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3471; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3472; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3473; GFX940-TGSPLIT-NEXT: s_endpgm 3474; 3475; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3476; GFX11-WGP: ; %bb.0: ; %entry 3477; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3478; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3479; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3480; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3481; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3482; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3483; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3484; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3485; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3486; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3487; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3488; GFX11-WGP-NEXT: s_endpgm 3489; 3490; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3491; GFX11-CU: ; %bb.0: ; %entry 3492; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3493; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3494; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3495; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3496; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3497; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3498; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3499; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3500; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3501; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3502; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3503; 
GFX11-CU-NEXT: s_endpgm 3504; 3505; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3506; GFX12-WGP: ; %bb.0: ; %entry 3507; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3508; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3509; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3510; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3511; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3512; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3513; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3514; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3515; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3516; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3517; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3518; GFX12-WGP-NEXT: s_endpgm 3519; 3520; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: 3521; GFX12-CU: ; %bb.0: ; %entry 3522; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3523; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3524; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3525; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3526; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3527; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3528; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3529; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3530; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 3531; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 3532; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3533; GFX12-CU-NEXT: s_endpgm 3534 ptr %out, i32 %in, i32 %old) { 3535entry: 3536 %gep = getelementptr i32, ptr %out, i32 4 3537 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic 3538 ret void 3539} 3540 3541define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( 3542; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3543; GFX7: ; %bb.0: ; %entry 3544; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3545; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3546; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 3547; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 3548; 
GFX7-NEXT: s_mov_b64 s[10:11], 16 3549; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3550; GFX7-NEXT: s_mov_b32 s4, s8 3551; GFX7-NEXT: s_mov_b32 s5, s9 3552; GFX7-NEXT: s_mov_b32 s9, s10 3553; GFX7-NEXT: s_mov_b32 s8, s11 3554; GFX7-NEXT: s_add_u32 s4, s4, s9 3555; GFX7-NEXT: s_addc_u32 s8, s5, s8 3556; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3557; GFX7-NEXT: s_mov_b32 s5, s8 3558; GFX7-NEXT: v_mov_b32_e32 v2, s7 3559; GFX7-NEXT: v_mov_b32_e32 v0, s6 3560; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3561; GFX7-NEXT: v_mov_b32_e32 v3, v0 3562; GFX7-NEXT: v_mov_b32_e32 v0, s4 3563; GFX7-NEXT: v_mov_b32_e32 v1, s5 3564; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3565; GFX7-NEXT: s_endpgm 3566; 3567; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3568; GFX10-WGP: ; %bb.0: ; %entry 3569; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 3570; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3571; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 3572; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 3573; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 3574; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3575; GFX10-WGP-NEXT: s_mov_b32 s4, s8 3576; GFX10-WGP-NEXT: s_mov_b32 s5, s9 3577; GFX10-WGP-NEXT: s_mov_b32 s9, s10 3578; GFX10-WGP-NEXT: s_mov_b32 s8, s11 3579; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 3580; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 3581; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3582; GFX10-WGP-NEXT: s_mov_b32 s5, s8 3583; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 3584; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 3585; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3586; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 3587; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 3588; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 3589; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3590; GFX10-WGP-NEXT: s_endpgm 3591; 3592; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3593; GFX10-CU: ; %bb.0: ; %entry 3594; 
GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 3595; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 3596; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 3597; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 3598; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 3599; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3600; GFX10-CU-NEXT: s_mov_b32 s4, s8 3601; GFX10-CU-NEXT: s_mov_b32 s5, s9 3602; GFX10-CU-NEXT: s_mov_b32 s9, s10 3603; GFX10-CU-NEXT: s_mov_b32 s8, s11 3604; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 3605; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 3606; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 3607; GFX10-CU-NEXT: s_mov_b32 s5, s8 3608; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 3609; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 3610; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3611; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 3612; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 3613; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 3614; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3615; GFX10-CU-NEXT: s_endpgm 3616; 3617; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3618; SKIP-CACHE-INV: ; %bb.0: ; %entry 3619; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 3620; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 3621; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 3622; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 3623; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 3624; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3625; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 3626; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 3627; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 3628; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 3629; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 3630; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 3631; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 3632; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 3633; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 3634; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3635; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 3636; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 3637; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3638; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3639; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3640; SKIP-CACHE-INV-NEXT: s_endpgm 3641; 3642; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3643; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 3644; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3645; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3646; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3647; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3648; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3649; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3650; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3651; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3652; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3653; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3654; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 3655; 3656; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3657; GFX90A-TGSPLIT: ; %bb.0: ; %entry 3658; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3659; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 3660; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 3661; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3662; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 3663; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 3664; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3665; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3666; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 3667; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3668; GFX90A-TGSPLIT-NEXT: s_endpgm 3669; 3670; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3671; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 3672; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 3673; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3674; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3675; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3676; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3677; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3678; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3679; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3680; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3681; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3682; GFX940-NOTTGSPLIT-NEXT: s_endpgm 3683; 3684; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3685; GFX940-TGSPLIT: ; %bb.0: ; %entry 3686; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3687; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 3688; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 3689; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 3690; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 3691; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 3692; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3693; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 3694; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 3695; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 3696; GFX940-TGSPLIT-NEXT: s_endpgm 3697; 3698; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3699; GFX11-WGP: ; %bb.0: ; %entry 3700; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3701; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3702; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3703; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 3704; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 3705; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 3706; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3707; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 3708; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 3709; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 3710; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3711; GFX11-WGP-NEXT: s_endpgm 3712; 3713; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3714; GFX11-CU: ; %bb.0: ; %entry 3715; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3716; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3717; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3718; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 3719; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 3720; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 3721; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3722; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 3723; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 3724; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 3725; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3726; GFX11-CU-NEXT: s_endpgm 3727; 3728; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3729; GFX12-WGP: ; %bb.0: ; %entry 3730; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3731; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 3732; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 3733; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 3734; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 3735; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 3736; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3737; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 3738; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 3739; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 3740; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 3741; GFX12-WGP-NEXT: s_endpgm 3742; 3743; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: 3744; GFX12-CU: ; %bb.0: ; %entry 3745; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3746; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 3747; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 3748; GFX12-CU-NEXT: s_wait_kmcnt 0x0 3749; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 3750; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 3751; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 3752; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 3753; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
  ret void
}

; Kernel under test: a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering `acquire` and failure ordering `acquire`.
; The target address is %out advanced by four i32 elements; the expected code
; applies that displacement either as `offset:16` on the cmpswap or through an
; explicit 64-bit scalar add of the constant 16. The cmpxchg result (%val) is
; never used.
; For every check prefix the expected sequence consists only of kernel-argument
; loads, register setup, one flat cmpswap and s_endpgm.
; NOTE(review): presumably the single-thread scope is why no extra waits or
; cache maintenance appear around the atomic - confirm against the AMDGPU
; memory model documentation.
; The assertions between `define` and the parameter list are autogenerated
; (utils/update_llc_test_checks.py); regenerate them instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
  ret void
}

; Kernel under test: a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering `release` and failure ordering `acquire`.
; The target address is %out advanced by four i32 elements; the expected code
; applies that displacement either as `offset:16` on the cmpswap or through an
; explicit 64-bit scalar add of the constant 16. The cmpxchg result (%val) is
; never used.
; For every check prefix the expected sequence consists only of kernel-argument
; loads, register setup, one flat cmpswap and s_endpgm.
; NOTE(review): presumably the single-thread scope is why no extra waits or
; cache maintenance appear around the atomic - confirm against the AMDGPU
; memory model documentation.
; The assertions between `define` and the parameter list are autogenerated
; (utils/update_llc_test_checks.py); regenerate them instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
  ret void
}

; Kernel under test: a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering `acq_rel` and failure ordering `acquire`.
; The target address is %out advanced by four i32 elements; the expected code
; applies that displacement either as `offset:16` on the cmpswap or through an
; explicit 64-bit scalar add of the constant 16. The cmpxchg result (%val) is
; never used.
; For every check prefix the expected sequence consists only of kernel-argument
; loads, register setup, one flat cmpswap and s_endpgm.
; NOTE(review): presumably the single-thread scope is why no extra waits or
; cache maintenance appear around the atomic - confirm against the AMDGPU
; memory model documentation.
; The assertions between `define` and the parameter list are autogenerated
; (utils/update_llc_test_checks.py); regenerate them instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
  ret void
}

define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4467; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4468; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4469; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4470; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4471; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4472; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4473; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4474; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4475; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4476; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4477; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4478; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4479; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4480; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4481; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4482; GFX10-WGP-NEXT: s_endpgm 4483; 4484; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4485; GFX10-CU: ; %bb.0: ; %entry 4486; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4487; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4488; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4489; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4490; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4491; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4492; GFX10-CU-NEXT: s_mov_b32 s4, s8 4493; GFX10-CU-NEXT: s_mov_b32 s5, s9 4494; GFX10-CU-NEXT: s_mov_b32 s9, s10 4495; GFX10-CU-NEXT: s_mov_b32 s8, s11 4496; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4497; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4498; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4499; GFX10-CU-NEXT: s_mov_b32 s5, s8 4500; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4501; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4502; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4503; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4504; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4505; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4506; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4507; GFX10-CU-NEXT: s_endpgm 4508; 4509; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4510; SKIP-CACHE-INV: ; %bb.0: ; %entry 4511; 
SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4512; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4513; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 4514; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 4515; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 4516; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4517; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 4518; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 4519; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4520; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4521; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 4522; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4523; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4524; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4525; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4526; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4527; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4528; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4529; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4530; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4531; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4532; SKIP-CACHE-INV-NEXT: s_endpgm 4533; 4534; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4535; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4536; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4537; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4538; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4539; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4540; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4541; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4542; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4543; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4544; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4545; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4546; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4547; 4548; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_seq_cst_acquire_cmpxchg: 4549; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4550; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4551; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4552; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4553; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4554; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4555; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4556; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4557; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4558; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4559; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4560; GFX90A-TGSPLIT-NEXT: s_endpgm 4561; 4562; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4563; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4564; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4565; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4566; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4567; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4568; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4569; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4570; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4571; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4572; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4573; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4574; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4575; 4576; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4577; GFX940-TGSPLIT: ; %bb.0: ; %entry 4578; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4579; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4580; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4581; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4582; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4583; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4584; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 4585; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4586; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4587; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4588; GFX940-TGSPLIT-NEXT: s_endpgm 4589; 4590; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4591; GFX11-WGP: ; %bb.0: ; %entry 4592; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4593; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4594; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4595; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4596; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 4597; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 4598; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4599; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 4600; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 4601; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 4602; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4603; GFX11-WGP-NEXT: s_endpgm 4604; 4605; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4606; GFX11-CU: ; %bb.0: ; %entry 4607; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4608; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4609; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4610; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4611; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 4612; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 4613; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4614; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 4615; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 4616; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 4617; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4618; GFX11-CU-NEXT: s_endpgm 4619; 4620; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4621; GFX12-WGP: ; %bb.0: ; %entry 4622; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4623; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4624; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4625; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 4626; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 4627; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s2 4628; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4629; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 4630; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 4631; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 4632; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4633; GFX12-WGP-NEXT: s_endpgm 4634; 4635; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: 4636; GFX12-CU: ; %bb.0: ; %entry 4637; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4638; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4639; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4640; GFX12-CU-NEXT: s_wait_kmcnt 0x0 4641; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 4642; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 4643; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4644; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 4645; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 4646; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 4647; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4648; GFX12-CU-NEXT: s_endpgm 4649 ptr %out, i32 %in, i32 %old) { 4650entry: 4651 %gep = getelementptr i32, ptr %out, i32 4 4652 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire 4653 ret void 4654} 4655 4656define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( 4657; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4658; GFX7: ; %bb.0: ; %entry 4659; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4660; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4661; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 4662; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 4663; GFX7-NEXT: s_mov_b64 s[10:11], 16 4664; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4665; GFX7-NEXT: s_mov_b32 s4, s8 4666; GFX7-NEXT: s_mov_b32 s5, s9 4667; GFX7-NEXT: s_mov_b32 s9, s10 4668; GFX7-NEXT: s_mov_b32 s8, s11 4669; GFX7-NEXT: s_add_u32 s4, s4, s9 4670; GFX7-NEXT: s_addc_u32 s8, s5, s8 4671; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4672; GFX7-NEXT: s_mov_b32 s5, s8 4673; 
GFX7-NEXT: v_mov_b32_e32 v2, s7 4674; GFX7-NEXT: v_mov_b32_e32 v0, s6 4675; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4676; GFX7-NEXT: v_mov_b32_e32 v3, v0 4677; GFX7-NEXT: v_mov_b32_e32 v0, s4 4678; GFX7-NEXT: v_mov_b32_e32 v1, s5 4679; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4680; GFX7-NEXT: s_endpgm 4681; 4682; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4683; GFX10-WGP: ; %bb.0: ; %entry 4684; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 4685; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4686; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 4687; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 4688; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 4689; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4690; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4691; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4692; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4693; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4694; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4695; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4696; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4697; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4698; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4699; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4700; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4701; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4702; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4703; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4704; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4705; GFX10-WGP-NEXT: s_endpgm 4706; 4707; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4708; GFX10-CU: ; %bb.0: ; %entry 4709; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4710; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4711; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4712; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4713; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4714; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4715; GFX10-CU-NEXT: s_mov_b32 s4, s8 4716; GFX10-CU-NEXT: s_mov_b32 s5, s9 4717; GFX10-CU-NEXT: s_mov_b32 s9, s10 
4718; GFX10-CU-NEXT: s_mov_b32 s8, s11 4719; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4720; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4721; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4722; GFX10-CU-NEXT: s_mov_b32 s5, s8 4723; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4724; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4725; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4726; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4727; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4728; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4729; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4730; GFX10-CU-NEXT: s_endpgm 4731; 4732; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4733; SKIP-CACHE-INV: ; %bb.0: ; %entry 4734; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4735; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4736; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 4737; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 4738; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 4739; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4740; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 4741; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 4742; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4743; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4744; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 4745; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4746; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4747; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4748; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4749; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4750; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4751; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4752; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4754; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4755; SKIP-CACHE-INV-NEXT: s_endpgm 4756; 4757; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4758; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry 4759; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4760; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4761; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4762; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4763; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4764; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4765; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4766; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4767; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4768; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4769; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4770; 4771; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4772; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4773; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4774; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4775; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4776; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4777; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4778; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4779; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4780; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4781; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4782; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4783; GFX90A-TGSPLIT-NEXT: s_endpgm 4784; 4785; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4786; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 4787; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4788; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4789; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4790; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4791; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4792; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4793; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4794; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4795; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4796; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4797; GFX940-NOTTGSPLIT-NEXT: s_endpgm 4798; 4799; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4800; GFX940-TGSPLIT: ; %bb.0: ; %entry 4801; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4802; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 4803; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 4804; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4805; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 4806; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 4807; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4808; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4809; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 4810; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4811; GFX940-TGSPLIT-NEXT: s_endpgm 4812; 4813; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4814; GFX11-WGP: ; %bb.0: ; %entry 4815; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4816; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4817; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4818; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 4819; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 4820; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 4821; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4822; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 4823; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 4824; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 4825; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4826; GFX11-WGP-NEXT: s_endpgm 4827; 4828; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4829; GFX11-CU: ; %bb.0: ; %entry 4830; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4831; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4832; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4833; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 4834; GFX11-CU-NEXT: v_mov_b32_e32 v2, 
s3 4835; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 4836; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4837; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 4838; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 4839; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 4840; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4841; GFX11-CU-NEXT: s_endpgm 4842; 4843; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4844; GFX12-WGP: ; %bb.0: ; %entry 4845; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4846; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 4847; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 4848; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 4849; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 4850; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 4851; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4852; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 4853; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 4854; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 4855; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4856; GFX12-WGP-NEXT: s_endpgm 4857; 4858; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: 4859; GFX12-CU: ; %bb.0: ; %entry 4860; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 4861; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 4862; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 4863; GFX12-CU-NEXT: s_wait_kmcnt 0x0 4864; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 4865; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 4866; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4867; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 4868; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 4869; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 4870; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 4871; GFX12-CU-NEXT: s_endpgm 4872 ptr %out, i32 %in, i32 %old) { 4873entry: 4874 %gep = getelementptr i32, ptr %out, i32 4 4875 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst 4876 ret void 4877} 4878 4879define amdgpu_kernel void 
@flat_singlethread_acquire_seq_cst_cmpxchg( 4880; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 4881; GFX7: ; %bb.0: ; %entry 4882; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4883; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4884; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 4885; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 4886; GFX7-NEXT: s_mov_b64 s[10:11], 16 4887; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4888; GFX7-NEXT: s_mov_b32 s4, s8 4889; GFX7-NEXT: s_mov_b32 s5, s9 4890; GFX7-NEXT: s_mov_b32 s9, s10 4891; GFX7-NEXT: s_mov_b32 s8, s11 4892; GFX7-NEXT: s_add_u32 s4, s4, s9 4893; GFX7-NEXT: s_addc_u32 s8, s5, s8 4894; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4895; GFX7-NEXT: s_mov_b32 s5, s8 4896; GFX7-NEXT: v_mov_b32_e32 v2, s7 4897; GFX7-NEXT: v_mov_b32_e32 v0, s6 4898; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4899; GFX7-NEXT: v_mov_b32_e32 v3, v0 4900; GFX7-NEXT: v_mov_b32_e32 v0, s4 4901; GFX7-NEXT: v_mov_b32_e32 v1, s5 4902; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4903; GFX7-NEXT: s_endpgm 4904; 4905; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 4906; GFX10-WGP: ; %bb.0: ; %entry 4907; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 4908; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4909; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 4910; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 4911; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 4912; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4913; GFX10-WGP-NEXT: s_mov_b32 s4, s8 4914; GFX10-WGP-NEXT: s_mov_b32 s5, s9 4915; GFX10-WGP-NEXT: s_mov_b32 s9, s10 4916; GFX10-WGP-NEXT: s_mov_b32 s8, s11 4917; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 4918; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 4919; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4920; GFX10-WGP-NEXT: s_mov_b32 s5, s8 4921; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 4922; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 4923; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4924; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 4925; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4926; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4927; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4928; GFX10-WGP-NEXT: s_endpgm 4929; 4930; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 4931; GFX10-CU: ; %bb.0: ; %entry 4932; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 4933; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 4934; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 4935; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 4936; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 4937; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4938; GFX10-CU-NEXT: s_mov_b32 s4, s8 4939; GFX10-CU-NEXT: s_mov_b32 s5, s9 4940; GFX10-CU-NEXT: s_mov_b32 s9, s10 4941; GFX10-CU-NEXT: s_mov_b32 s8, s11 4942; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 4943; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 4944; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 4945; GFX10-CU-NEXT: s_mov_b32 s5, s8 4946; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 4947; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 4948; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4949; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 4950; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4951; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4952; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4953; GFX10-CU-NEXT: s_endpgm 4954; 4955; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 4956; SKIP-CACHE-INV: ; %bb.0: ; %entry 4957; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 4958; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 4959; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 4960; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 4961; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 4962; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4963; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 4964; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 4965; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 4966; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 4967; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 
4968; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 4969; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 4970; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 4971; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 4972; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4973; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4974; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 4975; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4976; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4977; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4978; SKIP-CACHE-INV-NEXT: s_endpgm 4979; 4980; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 4981; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 4982; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4983; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4984; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4985; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 4986; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 4987; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 4988; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 4989; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 4990; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 4991; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 4992; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 4993; 4994; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 4995; GFX90A-TGSPLIT: ; %bb.0: ; %entry 4996; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 4997; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 4998; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 4999; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5000; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5001; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5002; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5003; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5004; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5005; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5006; GFX90A-TGSPLIT-NEXT: s_endpgm 5007; 5008; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 5009; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5010; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5011; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5012; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5013; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5014; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5015; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5016; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5017; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5018; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5019; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5020; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5021; 5022; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 5023; GFX940-TGSPLIT: ; %bb.0: ; %entry 5024; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5025; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5026; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5027; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5028; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5029; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5030; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5031; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5032; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5033; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5034; GFX940-TGSPLIT-NEXT: s_endpgm 5035; 5036; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 5037; GFX11-WGP: ; %bb.0: ; %entry 5038; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5039; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5040; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5041; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5042; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s3 5043; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5044; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5045; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5046; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5047; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5048; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5049; GFX11-WGP-NEXT: s_endpgm 5050; 5051; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 5052; GFX11-CU: ; %bb.0: ; %entry 5053; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5054; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5055; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5056; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5057; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5058; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5059; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5060; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5061; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5062; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5063; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5064; GFX11-CU-NEXT: s_endpgm 5065; 5066; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 5067; GFX12-WGP: ; %bb.0: ; %entry 5068; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5069; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5070; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5071; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5072; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5073; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5074; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5075; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5076; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5077; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5078; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5079; GFX12-WGP-NEXT: s_endpgm 5080; 5081; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: 5082; GFX12-CU: ; %bb.0: ; %entry 5083; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5084; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5085; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc 5086; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5087; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5088; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5089; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5090; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5091; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5092; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5093; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5094; GFX12-CU-NEXT: s_endpgm 5095 ptr %out, i32 %in, i32 %old) { 5096entry: 5097 %gep = getelementptr i32, ptr %out, i32 4 5098 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst 5099 ret void 5100} 5101 5102define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( 5103; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5104; GFX7: ; %bb.0: ; %entry 5105; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 5106; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5107; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 5108; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 5109; GFX7-NEXT: s_mov_b64 s[10:11], 16 5110; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5111; GFX7-NEXT: s_mov_b32 s4, s8 5112; GFX7-NEXT: s_mov_b32 s5, s9 5113; GFX7-NEXT: s_mov_b32 s9, s10 5114; GFX7-NEXT: s_mov_b32 s8, s11 5115; GFX7-NEXT: s_add_u32 s4, s4, s9 5116; GFX7-NEXT: s_addc_u32 s8, s5, s8 5117; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5118; GFX7-NEXT: s_mov_b32 s5, s8 5119; GFX7-NEXT: v_mov_b32_e32 v2, s7 5120; GFX7-NEXT: v_mov_b32_e32 v0, s6 5121; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5122; GFX7-NEXT: v_mov_b32_e32 v3, v0 5123; GFX7-NEXT: v_mov_b32_e32 v0, s4 5124; GFX7-NEXT: v_mov_b32_e32 v1, s5 5125; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5126; GFX7-NEXT: s_endpgm 5127; 5128; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5129; GFX10-WGP: ; %bb.0: ; %entry 5130; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 5131; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5132; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 5133; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 5134; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 5135; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5136; GFX10-WGP-NEXT: s_mov_b32 s4, s8 5137; GFX10-WGP-NEXT: s_mov_b32 s5, s9 5138; GFX10-WGP-NEXT: s_mov_b32 s9, s10 5139; GFX10-WGP-NEXT: s_mov_b32 s8, s11 5140; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 5141; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 5142; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5143; GFX10-WGP-NEXT: s_mov_b32 s5, s8 5144; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 5145; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5146; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5147; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5148; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5149; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5150; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5151; GFX10-WGP-NEXT: s_endpgm 5152; 5153; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5154; GFX10-CU: ; %bb.0: ; %entry 5155; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 5156; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5157; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 5158; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 5159; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 5160; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5161; GFX10-CU-NEXT: s_mov_b32 s4, s8 5162; GFX10-CU-NEXT: s_mov_b32 s5, s9 5163; GFX10-CU-NEXT: s_mov_b32 s9, s10 5164; GFX10-CU-NEXT: s_mov_b32 s8, s11 5165; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 5166; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 5167; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5168; GFX10-CU-NEXT: s_mov_b32 s5, s8 5169; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 5170; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5171; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5172; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5173; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5174; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5175; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 5176; GFX10-CU-NEXT: s_endpgm 5177; 5178; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5179; SKIP-CACHE-INV: ; %bb.0: ; %entry 5180; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 5181; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 5182; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 5183; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 5184; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 5185; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5186; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 5187; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 5188; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 5189; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 5190; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 5191; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 5192; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 5193; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 5194; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5195; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5196; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5197; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5198; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5199; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5200; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5201; SKIP-CACHE-INV-NEXT: s_endpgm 5202; 5203; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5204; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5205; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5206; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5207; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5208; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5209; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5210; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5211; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5212; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5213; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] 
op_sel:[0,1] 5214; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5215; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5216; 5217; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5218; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5219; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5220; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5221; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5222; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5223; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5224; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5225; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5226; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5227; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5228; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5229; GFX90A-TGSPLIT-NEXT: s_endpgm 5230; 5231; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5232; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5233; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5234; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5235; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5236; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5237; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5238; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5239; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5240; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5241; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5242; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5243; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5244; 5245; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5246; GFX940-TGSPLIT: ; %bb.0: ; %entry 5247; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5248; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5249; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5250; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
5251; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5252; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5253; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5254; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5255; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5256; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5257; GFX940-TGSPLIT-NEXT: s_endpgm 5258; 5259; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5260; GFX11-WGP: ; %bb.0: ; %entry 5261; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5262; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5263; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5264; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5265; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5266; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5267; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5268; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5269; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5270; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5271; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5272; GFX11-WGP-NEXT: s_endpgm 5273; 5274; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5275; GFX11-CU: ; %bb.0: ; %entry 5276; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5277; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5278; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5279; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5280; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5281; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5282; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5283; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5284; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5285; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5286; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5287; GFX11-CU-NEXT: s_endpgm 5288; 5289; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5290; GFX12-WGP: ; %bb.0: ; %entry 5291; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5292; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 
5293; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5294; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5295; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5296; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5297; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5298; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5299; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5300; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5301; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5302; GFX12-WGP-NEXT: s_endpgm 5303; 5304; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: 5305; GFX12-CU: ; %bb.0: ; %entry 5306; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5307; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5308; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5309; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5310; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5311; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5312; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5313; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5314; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5315; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5316; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5317; GFX12-CU-NEXT: s_endpgm 5318 ptr %out, i32 %in, i32 %old) { 5319entry: 5320 %gep = getelementptr i32, ptr %out, i32 4 5321 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst 5322 ret void 5323} 5324 5325define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( 5326; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5327; GFX7: ; %bb.0: ; %entry 5328; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 5329; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5330; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 5331; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 5332; GFX7-NEXT: s_mov_b64 s[10:11], 16 5333; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5334; GFX7-NEXT: s_mov_b32 s4, s8 5335; GFX7-NEXT: s_mov_b32 s5, s9 5336; GFX7-NEXT: s_mov_b32 s9, s10 5337; GFX7-NEXT: s_mov_b32 s8, s11 5338; GFX7-NEXT: s_add_u32 s4, s4, s9 
5339; GFX7-NEXT: s_addc_u32 s8, s5, s8 5340; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5341; GFX7-NEXT: s_mov_b32 s5, s8 5342; GFX7-NEXT: v_mov_b32_e32 v2, s7 5343; GFX7-NEXT: v_mov_b32_e32 v0, s6 5344; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5345; GFX7-NEXT: v_mov_b32_e32 v3, v0 5346; GFX7-NEXT: v_mov_b32_e32 v0, s4 5347; GFX7-NEXT: v_mov_b32_e32 v1, s5 5348; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5349; GFX7-NEXT: s_endpgm 5350; 5351; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5352; GFX10-WGP: ; %bb.0: ; %entry 5353; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 5354; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5355; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 5356; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 5357; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 5358; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5359; GFX10-WGP-NEXT: s_mov_b32 s4, s8 5360; GFX10-WGP-NEXT: s_mov_b32 s5, s9 5361; GFX10-WGP-NEXT: s_mov_b32 s9, s10 5362; GFX10-WGP-NEXT: s_mov_b32 s8, s11 5363; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 5364; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 5365; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5366; GFX10-WGP-NEXT: s_mov_b32 s5, s8 5367; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 5368; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5369; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5370; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5371; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5372; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5373; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5374; GFX10-WGP-NEXT: s_endpgm 5375; 5376; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5377; GFX10-CU: ; %bb.0: ; %entry 5378; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 5379; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5380; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 5381; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 5382; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 5383; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5384; GFX10-CU-NEXT: s_mov_b32 s4, s8 5385; GFX10-CU-NEXT: s_mov_b32 s5, s9 5386; GFX10-CU-NEXT: s_mov_b32 s9, s10 5387; GFX10-CU-NEXT: s_mov_b32 s8, s11 5388; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 5389; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 5390; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5391; GFX10-CU-NEXT: s_mov_b32 s5, s8 5392; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 5393; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5394; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5395; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5396; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5397; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5398; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5399; GFX10-CU-NEXT: s_endpgm 5400; 5401; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5402; SKIP-CACHE-INV: ; %bb.0: ; %entry 5403; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 5404; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 5405; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 5406; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 5407; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 5408; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5409; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 5410; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 5411; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 5412; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 5413; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 5414; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 5415; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 5416; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 5417; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5418; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5419; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5420; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5421; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5422; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5423; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5424; 
SKIP-CACHE-INV-NEXT: s_endpgm 5425; 5426; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5427; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5428; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5429; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5430; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5431; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5432; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5433; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5434; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5435; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5436; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5437; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5438; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5439; 5440; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5441; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5442; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5443; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5444; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5445; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5446; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5447; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5448; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5449; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5450; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5451; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5452; GFX90A-TGSPLIT-NEXT: s_endpgm 5453; 5454; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5455; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5456; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5457; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5458; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5459; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5460; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5461; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5462; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5463; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5464; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5465; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5466; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5467; 5468; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5469; GFX940-TGSPLIT: ; %bb.0: ; %entry 5470; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5471; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5472; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5473; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5474; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5475; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5476; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5477; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5478; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5479; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5480; GFX940-TGSPLIT-NEXT: s_endpgm 5481; 5482; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5483; GFX11-WGP: ; %bb.0: ; %entry 5484; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5485; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5486; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5487; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5488; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5489; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5490; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5491; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5492; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5493; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5494; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5495; GFX11-WGP-NEXT: s_endpgm 5496; 5497; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5498; GFX11-CU: ; %bb.0: ; %entry 5499; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5500; GFX11-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 5501; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5502; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5503; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5504; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5505; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5506; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5507; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5508; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5509; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5510; GFX11-CU-NEXT: s_endpgm 5511; 5512; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5513; GFX12-WGP: ; %bb.0: ; %entry 5514; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5515; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5516; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5517; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5518; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5519; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5520; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5521; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5522; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5523; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5524; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5525; GFX12-WGP-NEXT: s_endpgm 5526; 5527; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: 5528; GFX12-CU: ; %bb.0: ; %entry 5529; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5530; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5531; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5532; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5533; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5534; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5535; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5536; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5537; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5538; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5539; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5540; GFX12-CU-NEXT: s_endpgm 5541 ptr %out, i32 %in, i32 %old) { 5542entry: 5543 %gep = getelementptr i32, ptr %out, i32 4 5544 %val = cmpxchg 
volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst 5545 ret void 5546} 5547 5548define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( 5549; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5550; GFX7: ; %bb.0: ; %entry 5551; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 5552; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5553; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 5554; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 5555; GFX7-NEXT: s_mov_b64 s[10:11], 16 5556; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5557; GFX7-NEXT: s_mov_b32 s4, s8 5558; GFX7-NEXT: s_mov_b32 s5, s9 5559; GFX7-NEXT: s_mov_b32 s9, s10 5560; GFX7-NEXT: s_mov_b32 s8, s11 5561; GFX7-NEXT: s_add_u32 s4, s4, s9 5562; GFX7-NEXT: s_addc_u32 s8, s5, s8 5563; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5564; GFX7-NEXT: s_mov_b32 s5, s8 5565; GFX7-NEXT: v_mov_b32_e32 v2, s7 5566; GFX7-NEXT: v_mov_b32_e32 v0, s6 5567; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5568; GFX7-NEXT: v_mov_b32_e32 v3, v0 5569; GFX7-NEXT: v_mov_b32_e32 v0, s4 5570; GFX7-NEXT: v_mov_b32_e32 v1, s5 5571; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5572; GFX7-NEXT: s_endpgm 5573; 5574; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5575; GFX10-WGP: ; %bb.0: ; %entry 5576; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 5577; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5578; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 5579; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 5580; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 5581; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5582; GFX10-WGP-NEXT: s_mov_b32 s4, s8 5583; GFX10-WGP-NEXT: s_mov_b32 s5, s9 5584; GFX10-WGP-NEXT: s_mov_b32 s9, s10 5585; GFX10-WGP-NEXT: s_mov_b32 s8, s11 5586; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 5587; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 5588; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5589; GFX10-WGP-NEXT: s_mov_b32 s5, s8 5590; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 
5591; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5592; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5593; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5594; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5595; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5596; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5597; GFX10-WGP-NEXT: s_endpgm 5598; 5599; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5600; GFX10-CU: ; %bb.0: ; %entry 5601; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 5602; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 5603; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 5604; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 5605; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 5606; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5607; GFX10-CU-NEXT: s_mov_b32 s4, s8 5608; GFX10-CU-NEXT: s_mov_b32 s5, s9 5609; GFX10-CU-NEXT: s_mov_b32 s9, s10 5610; GFX10-CU-NEXT: s_mov_b32 s8, s11 5611; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 5612; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 5613; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 5614; GFX10-CU-NEXT: s_mov_b32 s5, s8 5615; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 5616; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5617; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5618; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5619; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5620; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5621; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5622; GFX10-CU-NEXT: s_endpgm 5623; 5624; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5625; SKIP-CACHE-INV: ; %bb.0: ; %entry 5626; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 5627; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 5628; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 5629; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 5630; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 5631; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5632; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 5633; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 5634; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 5635; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 5636; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 5637; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 5638; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 5639; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 5640; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 5641; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5642; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5643; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5644; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5646; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 5647; SKIP-CACHE-INV-NEXT: s_endpgm 5648; 5649; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5650; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5651; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5652; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5653; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5654; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5655; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5656; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5657; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5658; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5659; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5660; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5661; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5662; 5663; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5664; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5665; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5666; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5667; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5668; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5669; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5670; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5671; GFX90A-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5672; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5673; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5674; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5675; GFX90A-TGSPLIT-NEXT: s_endpgm 5676; 5677; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5678; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5679; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5680; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5681; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5682; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5683; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5684; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5685; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5686; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5687; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5688; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5689; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5690; 5691; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5692; GFX940-TGSPLIT: ; %bb.0: ; %entry 5693; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5694; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5695; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5696; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5697; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5698; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5699; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5700; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5701; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5702; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 5703; GFX940-TGSPLIT-NEXT: s_endpgm 5704; 5705; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5706; GFX11-WGP: ; %bb.0: ; %entry 5707; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5708; GFX11-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 5709; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5710; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5711; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5712; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5713; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5714; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5715; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5716; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5717; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5718; GFX11-WGP-NEXT: s_endpgm 5719; 5720; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5721; GFX11-CU: ; %bb.0: ; %entry 5722; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5723; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5724; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5725; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5726; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5727; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5728; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5729; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5730; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5731; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5732; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5733; GFX11-CU-NEXT: s_endpgm 5734; 5735; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5736; GFX12-WGP: ; %bb.0: ; %entry 5737; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5738; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5739; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5740; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 5741; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 5742; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 5743; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5744; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 5745; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 5746; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 5747; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5748; GFX12-WGP-NEXT: s_endpgm 5749; 5750; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: 5751; GFX12-CU: ; %bb.0: ; %entry 
5752; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5753; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5754; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5755; GFX12-CU-NEXT: s_wait_kmcnt 0x0 5756; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 5757; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 5758; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5759; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 5760; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 5761; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 5762; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 5763; GFX12-CU-NEXT: s_endpgm 5764 ptr %out, i32 %in, i32 %old) { 5765entry: 5766 %gep = getelementptr i32, ptr %out, i32 4 5767 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst 5768 ret void 5769} 5770 5771define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( 5772; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5773; GFX7: ; %bb.0: ; %entry 5774; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 5775; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5776; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 5777; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 5778; GFX7-NEXT: s_mov_b64 s[12:13], 16 5779; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5780; GFX7-NEXT: s_mov_b32 s6, s4 5781; GFX7-NEXT: s_mov_b32 s7, s5 5782; GFX7-NEXT: s_mov_b32 s11, s12 5783; GFX7-NEXT: s_mov_b32 s10, s13 5784; GFX7-NEXT: s_add_u32 s6, s6, s11 5785; GFX7-NEXT: s_addc_u32 s10, s7, s10 5786; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5787; GFX7-NEXT: s_mov_b32 s7, s10 5788; GFX7-NEXT: v_mov_b32_e32 v2, s9 5789; GFX7-NEXT: v_mov_b32_e32 v0, s8 5790; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5791; GFX7-NEXT: v_mov_b32_e32 v3, v0 5792; GFX7-NEXT: v_mov_b32_e32 v0, s6 5793; GFX7-NEXT: v_mov_b32_e32 v1, s7 5794; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5795; GFX7-NEXT: v_mov_b32_e32 v0, s4 5796; GFX7-NEXT: v_mov_b32_e32 v1, s5 5797; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 5798; GFX7-NEXT: flat_store_dword v[0:1], v2 5799; GFX7-NEXT: s_endpgm 5800; 5801; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5802; GFX10-WGP: ; %bb.0: ; %entry 5803; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 5804; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5805; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 5806; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 5807; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 5808; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5809; GFX10-WGP-NEXT: s_mov_b32 s6, s4 5810; GFX10-WGP-NEXT: s_mov_b32 s7, s5 5811; GFX10-WGP-NEXT: s_mov_b32 s11, s12 5812; GFX10-WGP-NEXT: s_mov_b32 s10, s13 5813; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 5814; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 5815; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5816; GFX10-WGP-NEXT: s_mov_b32 s7, s10 5817; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 5818; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 5819; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5820; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 5821; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 5822; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 5823; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5824; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5825; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5826; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5827; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5828; GFX10-WGP-NEXT: s_endpgm 5829; 5830; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5831; GFX10-CU: ; %bb.0: ; %entry 5832; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 5833; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 5834; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 5835; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 5836; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 5837; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5838; GFX10-CU-NEXT: s_mov_b32 s6, s4 5839; GFX10-CU-NEXT: s_mov_b32 s7, s5 5840; GFX10-CU-NEXT: s_mov_b32 s11, s12 5841; GFX10-CU-NEXT: s_mov_b32 s10, 
s13 5842; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 5843; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 5844; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 5845; GFX10-CU-NEXT: s_mov_b32 s7, s10 5846; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 5847; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 5848; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5849; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 5850; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 5851; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 5852; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5853; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5854; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5855; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5856; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 5857; GFX10-CU-NEXT: s_endpgm 5858; 5859; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5860; SKIP-CACHE-INV: ; %bb.0: ; %entry 5861; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 5862; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 5863; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 5864; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 5865; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 5866; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5867; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 5868; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 5869; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 5870; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 5871; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 5872; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 5873; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 5874; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 5875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 5876; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5877; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5878; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 5879; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5880; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5881; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] glc 5882; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 5883; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 5884; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5885; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5886; SKIP-CACHE-INV-NEXT: s_endpgm 5887; 5888; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5889; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 5890; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5891; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5892; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5893; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5894; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5895; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5896; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5897; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5898; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5899; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 5900; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5901; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5902; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 5903; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 5904; 5905; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5906; GFX90A-TGSPLIT: ; %bb.0: ; %entry 5907; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 5908; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 5909; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 5910; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5911; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 5912; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 5913; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5914; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5915; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5916; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 glc 5917; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 5918; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5919; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 5920; GFX90A-TGSPLIT-NEXT: s_endpgm 5921; 5922; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5923; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 5924; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5925; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5926; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5927; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5928; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5929; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5930; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5931; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5932; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5933; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5934; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5935; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5936; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 5937; GFX940-NOTTGSPLIT-NEXT: s_endpgm 5938; 5939; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5940; GFX940-TGSPLIT: ; %bb.0: ; %entry 5941; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5942; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 5943; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 5944; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 5945; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 5946; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 5947; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5948; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 5949; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5950; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 5951; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 5952; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 5953; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 5954; GFX940-TGSPLIT-NEXT: s_endpgm 5955; 5956; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5957; GFX11-WGP: ; %bb.0: ; %entry 5958; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5959; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5960; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5961; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 5962; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 5963; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 5964; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5965; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 5966; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5967; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5968; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5969; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 5970; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 5971; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5972; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 5973; GFX11-WGP-NEXT: s_endpgm 5974; 5975; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5976; GFX11-CU: ; %bb.0: ; %entry 5977; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5978; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 5979; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 5980; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 5981; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 5982; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 5983; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 5984; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 5985; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5986; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5987; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 5988; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 5989; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 5990; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5991; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 5992; GFX11-CU-NEXT: s_endpgm 5993; 5994; GFX12-WGP-LABEL: 
flat_singlethread_monotonic_monotonic_ret_cmpxchg: 5995; GFX12-WGP: ; %bb.0: ; %entry 5996; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 5997; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 5998; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 5999; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 6000; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 6001; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 6002; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6003; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 6004; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6005; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6006; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 6007; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6008; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6009; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6010; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 6011; GFX12-WGP-NEXT: s_endpgm 6012; 6013; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: 6014; GFX12-CU: ; %bb.0: ; %entry 6015; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6016; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6017; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6018; GFX12-CU-NEXT: s_wait_kmcnt 0x0 6019; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 6020; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 6021; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6022; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 6023; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6024; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6025; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 6026; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6027; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6028; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 6029; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 6030; GFX12-CU-NEXT: s_endpgm 6031 ptr %out, i32 %in, i32 %old) { 6032entry: 6033 %gep = getelementptr i32, ptr %out, i32 4 6034 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic 6035 %val0 = extractvalue 
{ i32, i1 } %val, 0 6036 store i32 %val0, ptr %out, align 4 6037 ret void 6038} 6039 6040define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( 6041; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6042; GFX7: ; %bb.0: ; %entry 6043; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 6044; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6045; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 6046; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 6047; GFX7-NEXT: s_mov_b64 s[12:13], 16 6048; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6049; GFX7-NEXT: s_mov_b32 s6, s4 6050; GFX7-NEXT: s_mov_b32 s7, s5 6051; GFX7-NEXT: s_mov_b32 s11, s12 6052; GFX7-NEXT: s_mov_b32 s10, s13 6053; GFX7-NEXT: s_add_u32 s6, s6, s11 6054; GFX7-NEXT: s_addc_u32 s10, s7, s10 6055; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6056; GFX7-NEXT: s_mov_b32 s7, s10 6057; GFX7-NEXT: v_mov_b32_e32 v2, s9 6058; GFX7-NEXT: v_mov_b32_e32 v0, s8 6059; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6060; GFX7-NEXT: v_mov_b32_e32 v3, v0 6061; GFX7-NEXT: v_mov_b32_e32 v0, s6 6062; GFX7-NEXT: v_mov_b32_e32 v1, s7 6063; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6064; GFX7-NEXT: v_mov_b32_e32 v0, s4 6065; GFX7-NEXT: v_mov_b32_e32 v1, s5 6066; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6067; GFX7-NEXT: flat_store_dword v[0:1], v2 6068; GFX7-NEXT: s_endpgm 6069; 6070; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6071; GFX10-WGP: ; %bb.0: ; %entry 6072; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 6073; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6074; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 6075; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 6076; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 6077; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6078; GFX10-WGP-NEXT: s_mov_b32 s6, s4 6079; GFX10-WGP-NEXT: s_mov_b32 s7, s5 6080; GFX10-WGP-NEXT: s_mov_b32 s11, s12 6081; GFX10-WGP-NEXT: s_mov_b32 s10, s13 6082; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 6083; 
GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 6084; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6085; GFX10-WGP-NEXT: s_mov_b32 s7, s10 6086; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 6087; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 6088; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6089; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 6090; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 6091; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 6092; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6093; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6094; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6095; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6096; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6097; GFX10-WGP-NEXT: s_endpgm 6098; 6099; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6100; GFX10-CU: ; %bb.0: ; %entry 6101; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 6102; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6103; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 6104; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 6105; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 6106; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6107; GFX10-CU-NEXT: s_mov_b32 s6, s4 6108; GFX10-CU-NEXT: s_mov_b32 s7, s5 6109; GFX10-CU-NEXT: s_mov_b32 s11, s12 6110; GFX10-CU-NEXT: s_mov_b32 s10, s13 6111; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 6112; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 6113; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6114; GFX10-CU-NEXT: s_mov_b32 s7, s10 6115; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 6116; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 6117; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6118; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 6119; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 6120; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 6121; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6122; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6123; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6124; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6125; 
GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6126; GFX10-CU-NEXT: s_endpgm 6127; 6128; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6129; SKIP-CACHE-INV: ; %bb.0: ; %entry 6130; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 6131; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 6132; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 6133; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 6134; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 6135; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6136; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 6137; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 6138; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 6139; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 6140; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 6141; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 6142; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 6143; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 6144; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 6145; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6146; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6147; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 6148; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6149; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6150; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6151; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6152; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6153; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6154; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6155; SKIP-CACHE-INV-NEXT: s_endpgm 6156; 6157; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6158; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6159; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6160; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6161; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6162; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6163; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6164; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s6 6165; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6166; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6167; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6168; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6169; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6170; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6171; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6172; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6173; 6174; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6175; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6176; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6177; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6178; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6179; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6180; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6181; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6182; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6183; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6184; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6185; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6186; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6187; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6188; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6189; GFX90A-TGSPLIT-NEXT: s_endpgm 6190; 6191; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6192; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6193; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6194; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6195; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6196; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6197; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6198; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6199; 
GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6200; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6201; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6202; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6203; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6204; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6205; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6206; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6207; 6208; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6209; GFX940-TGSPLIT: ; %bb.0: ; %entry 6210; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6211; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6212; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6213; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6214; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6215; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6216; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6217; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6218; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6219; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6220; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6221; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6222; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6223; GFX940-TGSPLIT-NEXT: s_endpgm 6224; 6225; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6226; GFX11-WGP: ; %bb.0: ; %entry 6227; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6228; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6229; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6230; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6231; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 6232; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 6233; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6234; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 6235; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6236; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6237; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6238; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6239; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6240; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6241; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6242; GFX11-WGP-NEXT: s_endpgm 6243; 6244; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6245; GFX11-CU: ; %bb.0: ; %entry 6246; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6247; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6248; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6249; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6250; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 6251; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 6252; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6253; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 6254; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6255; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6256; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6257; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6258; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6259; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6260; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6261; GFX11-CU-NEXT: s_endpgm 6262; 6263; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6264; GFX12-WGP: ; %bb.0: ; %entry 6265; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6266; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6267; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6268; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 6269; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 6270; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 6271; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6272; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 6273; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6274; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6275; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 6276; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6277; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 
s1 6278; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6279; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 6280; GFX12-WGP-NEXT: s_endpgm 6281; 6282; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: 6283; GFX12-CU: ; %bb.0: ; %entry 6284; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6285; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6286; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6287; GFX12-CU-NEXT: s_wait_kmcnt 0x0 6288; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 6289; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 6290; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6291; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 6292; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6293; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6294; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 6295; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6296; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6297; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 6298; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 6299; GFX12-CU-NEXT: s_endpgm 6300 ptr %out, i32 %in, i32 %old) { 6301entry: 6302 %gep = getelementptr i32, ptr %out, i32 4 6303 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic 6304 %val0 = extractvalue { i32, i1 } %val, 0 6305 store i32 %val0, ptr %out, align 4 6306 ret void 6307} 6308 6309define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( 6310; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6311; GFX7: ; %bb.0: ; %entry 6312; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 6313; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6314; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 6315; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 6316; GFX7-NEXT: s_mov_b64 s[12:13], 16 6317; GFX7-NEXT: s_waitcnt lgkmcnt(0) 6318; GFX7-NEXT: s_mov_b32 s6, s4 6319; GFX7-NEXT: s_mov_b32 s7, s5 6320; GFX7-NEXT: s_mov_b32 s11, s12 6321; GFX7-NEXT: s_mov_b32 s10, s13 6322; GFX7-NEXT: s_add_u32 s6, s6, s11 6323; GFX7-NEXT: s_addc_u32 s10, s7, s10 6324; 
GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6325; GFX7-NEXT: s_mov_b32 s7, s10 6326; GFX7-NEXT: v_mov_b32_e32 v2, s9 6327; GFX7-NEXT: v_mov_b32_e32 v0, s8 6328; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6329; GFX7-NEXT: v_mov_b32_e32 v3, v0 6330; GFX7-NEXT: v_mov_b32_e32 v0, s6 6331; GFX7-NEXT: v_mov_b32_e32 v1, s7 6332; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6333; GFX7-NEXT: v_mov_b32_e32 v0, s4 6334; GFX7-NEXT: v_mov_b32_e32 v1, s5 6335; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6336; GFX7-NEXT: flat_store_dword v[0:1], v2 6337; GFX7-NEXT: s_endpgm 6338; 6339; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6340; GFX10-WGP: ; %bb.0: ; %entry 6341; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 6342; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6343; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 6344; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 6345; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 6346; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 6347; GFX10-WGP-NEXT: s_mov_b32 s6, s4 6348; GFX10-WGP-NEXT: s_mov_b32 s7, s5 6349; GFX10-WGP-NEXT: s_mov_b32 s11, s12 6350; GFX10-WGP-NEXT: s_mov_b32 s10, s13 6351; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 6352; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 6353; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6354; GFX10-WGP-NEXT: s_mov_b32 s7, s10 6355; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 6356; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 6357; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6358; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 6359; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 6360; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 6361; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6362; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 6363; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 6364; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6365; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 6366; GFX10-WGP-NEXT: s_endpgm 6367; 6368; 
GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6369; GFX10-CU: ; %bb.0: ; %entry 6370; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 6371; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 6372; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 6373; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 6374; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 6375; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 6376; GFX10-CU-NEXT: s_mov_b32 s6, s4 6377; GFX10-CU-NEXT: s_mov_b32 s7, s5 6378; GFX10-CU-NEXT: s_mov_b32 s11, s12 6379; GFX10-CU-NEXT: s_mov_b32 s10, s13 6380; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 6381; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 6382; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 6383; GFX10-CU-NEXT: s_mov_b32 s7, s10 6384; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 6385; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 6386; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6387; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 6388; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 6389; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 6390; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6391; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 6392; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 6393; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6394; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 6395; GFX10-CU-NEXT: s_endpgm 6396; 6397; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6398; SKIP-CACHE-INV: ; %bb.0: ; %entry 6399; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 6400; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 6401; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 6402; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 6403; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 6404; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 6405; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 6406; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 6407; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 6408; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 6409; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 6410; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 6411; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 6412; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 6413; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 6414; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 6415; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6416; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 6417; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 6418; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 6419; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 6420; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 6421; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 6422; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6423; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 6424; SKIP-CACHE-INV-NEXT: s_endpgm 6425; 6426; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6427; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 6428; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 6429; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6430; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6431; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6432; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6433; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6434; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6435; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6436; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6437; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6438; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6439; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6440; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 6441; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 6442; 6443; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6444; GFX90A-TGSPLIT: ; %bb.0: ; %entry 6445; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 
6446; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 6447; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 6448; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6449; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 6450; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 6451; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6452; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6453; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6454; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 6455; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 6456; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6457; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 6458; GFX90A-TGSPLIT-NEXT: s_endpgm 6459; 6460; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6461; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 6462; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6463; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6464; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6465; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6466; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6467; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6468; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6469; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6470; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6471; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6472; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6473; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6474; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6475; GFX940-NOTTGSPLIT-NEXT: s_endpgm 6476; 6477; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6478; GFX940-TGSPLIT: ; %bb.0: ; %entry 6479; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6480; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 6481; 
GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 6482; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 6483; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 6484; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 6485; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6486; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 6487; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6488; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 6489; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 6490; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 6491; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 6492; GFX940-TGSPLIT-NEXT: s_endpgm 6493; 6494; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6495; GFX11-WGP: ; %bb.0: ; %entry 6496; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6497; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6498; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6499; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 6500; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 6501; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 6502; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6503; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 6504; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6505; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6506; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6507; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 6508; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 6509; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6510; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 6511; GFX11-WGP-NEXT: s_endpgm 6512; 6513; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6514; GFX11-CU: ; %bb.0: ; %entry 6515; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6516; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6517; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6518; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 6519; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 6520; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 6521; GFX11-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6522; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 6523; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6524; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6525; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 6526; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 6527; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 6528; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6529; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 6530; GFX11-CU-NEXT: s_endpgm 6531; 6532; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6533; GFX12-WGP: ; %bb.0: ; %entry 6534; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6535; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 6536; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 6537; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 6538; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 6539; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 6540; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6541; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 6542; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6543; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6544; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 6545; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 6546; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 6547; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 6548; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 6549; GFX12-WGP-NEXT: s_endpgm 6550; 6551; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: 6552; GFX12-CU: ; %bb.0: ; %entry 6553; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 6554; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 6555; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 6556; GFX12-CU-NEXT: s_wait_kmcnt 0x0 6557; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 6558; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 6559; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 6560; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 6561; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 6562; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 6563; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Returning cmpxchg on a generic pointer at syncscope "singlethread" with
; acq_rel success ordering and monotonic failure ordering. The value loaded
; by the cmpxchg is written back to %out so that the returning form of the
; atomic is selected. The assertions below are autogenerated and pin the
; exact -O0 instruction sequence for each RUN configuration.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Operate on the i32 four elements past %out (byte offset 16).
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  ; Field 0 of the cmpxchg result pair is the value that was read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Returning cmpxchg on a generic pointer at syncscope "singlethread" with
; seq_cst success ordering and monotonic failure ordering. The value loaded
; by the cmpxchg is written back to %out so that the returning form of the
; atomic is selected. The assertions below are autogenerated and pin the
; exact -O0 instruction sequence for each RUN configuration.
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Operate on the i32 four elements past %out (byte offset 16).
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  ; Field 0 of the cmpxchg result pair is the value that was read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
;
GFX7-NEXT: v_mov_b32_e32 v0, s6 7138; GFX7-NEXT: v_mov_b32_e32 v1, s7 7139; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7140; GFX7-NEXT: v_mov_b32_e32 v0, s4 7141; GFX7-NEXT: v_mov_b32_e32 v1, s5 7142; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7143; GFX7-NEXT: flat_store_dword v[0:1], v2 7144; GFX7-NEXT: s_endpgm 7145; 7146; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7147; GFX10-WGP: ; %bb.0: ; %entry 7148; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7149; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7150; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7151; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7152; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7153; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7154; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7155; GFX10-WGP-NEXT: s_mov_b32 s7, s5 7156; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7157; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7158; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7159; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7160; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7161; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7162; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 7163; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7164; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7165; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7166; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7167; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7168; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7169; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7170; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7171; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7172; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7173; GFX10-WGP-NEXT: s_endpgm 7174; 7175; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7176; GFX10-CU: ; %bb.0: ; %entry 7177; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7178; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7179; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7180; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7181; 
GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7182; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7183; GFX10-CU-NEXT: s_mov_b32 s6, s4 7184; GFX10-CU-NEXT: s_mov_b32 s7, s5 7185; GFX10-CU-NEXT: s_mov_b32 s11, s12 7186; GFX10-CU-NEXT: s_mov_b32 s10, s13 7187; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7188; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7189; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7190; GFX10-CU-NEXT: s_mov_b32 s7, s10 7191; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7192; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 7193; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7194; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 7195; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 7196; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 7197; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7198; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7199; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7200; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7201; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7202; GFX10-CU-NEXT: s_endpgm 7203; 7204; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7205; SKIP-CACHE-INV: ; %bb.0: ; %entry 7206; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 7207; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 7208; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 7209; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 7210; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 7211; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7212; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 7213; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 7214; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 7215; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 7216; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 7217; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 7218; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 7219; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 7220; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 7221; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7222; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 7223; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 7224; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7225; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7226; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7227; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 7228; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 7229; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7230; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7231; SKIP-CACHE-INV-NEXT: s_endpgm 7232; 7233; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7234; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7235; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7236; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7237; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7238; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7239; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7240; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7241; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7242; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7243; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7244; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7245; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7246; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7247; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7248; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7249; 7250; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7251; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7252; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7253; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7254; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7255; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7256; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7257; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7258; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 7259; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7260; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7261; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7262; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7263; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7264; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7265; GFX90A-TGSPLIT-NEXT: s_endpgm 7266; 7267; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7268; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7269; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7270; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7271; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7272; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7273; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7274; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7275; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7276; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7277; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7278; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7279; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7280; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7281; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7282; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7283; 7284; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7285; GFX940-TGSPLIT: ; %bb.0: ; %entry 7286; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7287; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7288; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7289; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7290; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7291; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7292; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7293; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 7294; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7295; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7296; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7297; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7298; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7299; GFX940-TGSPLIT-NEXT: s_endpgm 7300; 7301; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7302; GFX11-WGP: ; %bb.0: ; %entry 7303; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7304; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7305; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7306; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7307; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 7308; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 7309; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7310; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 7311; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7312; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7313; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7314; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7315; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7316; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7317; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7318; GFX11-WGP-NEXT: s_endpgm 7319; 7320; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7321; GFX11-CU: ; %bb.0: ; %entry 7322; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7323; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7324; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7325; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7326; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 7327; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 7328; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7329; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 7330; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7331; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7332; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7333; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7334; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 7335; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7336; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7337; GFX11-CU-NEXT: s_endpgm 7338; 7339; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7340; GFX12-WGP: ; %bb.0: ; %entry 7341; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7342; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7343; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7344; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 7345; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 7346; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 7347; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7348; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 7349; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7350; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7351; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7352; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7353; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7354; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7355; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 7356; GFX12-WGP-NEXT: s_endpgm 7357; 7358; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: 7359; GFX12-CU: ; %bb.0: ; %entry 7360; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7361; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7362; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7363; GFX12-CU-NEXT: s_wait_kmcnt 0x0 7364; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 7365; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 7366; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7367; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 7368; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7369; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7370; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7371; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7372; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7373; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 7374; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 7375; GFX12-CU-NEXT: s_endpgm 7376 ptr %out, i32 %in, i32 %old) { 7377entry: 
7378 %gep = getelementptr i32, ptr %out, i32 4 7379 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire 7380 %val0 = extractvalue { i32, i1 } %val, 0 7381 store i32 %val0, ptr %out, align 4 7382 ret void 7383} 7384 7385define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( 7386; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7387; GFX7: ; %bb.0: ; %entry 7388; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 7389; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7390; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 7391; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 7392; GFX7-NEXT: s_mov_b64 s[12:13], 16 7393; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7394; GFX7-NEXT: s_mov_b32 s6, s4 7395; GFX7-NEXT: s_mov_b32 s7, s5 7396; GFX7-NEXT: s_mov_b32 s11, s12 7397; GFX7-NEXT: s_mov_b32 s10, s13 7398; GFX7-NEXT: s_add_u32 s6, s6, s11 7399; GFX7-NEXT: s_addc_u32 s10, s7, s10 7400; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7401; GFX7-NEXT: s_mov_b32 s7, s10 7402; GFX7-NEXT: v_mov_b32_e32 v2, s9 7403; GFX7-NEXT: v_mov_b32_e32 v0, s8 7404; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7405; GFX7-NEXT: v_mov_b32_e32 v3, v0 7406; GFX7-NEXT: v_mov_b32_e32 v0, s6 7407; GFX7-NEXT: v_mov_b32_e32 v1, s7 7408; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7409; GFX7-NEXT: v_mov_b32_e32 v0, s4 7410; GFX7-NEXT: v_mov_b32_e32 v1, s5 7411; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7412; GFX7-NEXT: flat_store_dword v[0:1], v2 7413; GFX7-NEXT: s_endpgm 7414; 7415; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7416; GFX10-WGP: ; %bb.0: ; %entry 7417; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7418; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7419; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7420; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7421; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7422; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7423; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7424; 
GFX10-WGP-NEXT: s_mov_b32 s7, s5 7425; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7426; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7427; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7428; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7429; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7430; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7431; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 7432; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7433; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7434; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7435; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7436; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7437; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7438; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7439; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7440; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7441; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7442; GFX10-WGP-NEXT: s_endpgm 7443; 7444; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7445; GFX10-CU: ; %bb.0: ; %entry 7446; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7447; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7448; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7449; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7450; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7451; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7452; GFX10-CU-NEXT: s_mov_b32 s6, s4 7453; GFX10-CU-NEXT: s_mov_b32 s7, s5 7454; GFX10-CU-NEXT: s_mov_b32 s11, s12 7455; GFX10-CU-NEXT: s_mov_b32 s10, s13 7456; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7457; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7458; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7459; GFX10-CU-NEXT: s_mov_b32 s7, s10 7460; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7461; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 7462; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7463; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 7464; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 7465; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 7466; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc 7467; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7468; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7469; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7470; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7471; GFX10-CU-NEXT: s_endpgm 7472; 7473; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7474; SKIP-CACHE-INV: ; %bb.0: ; %entry 7475; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 7476; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 7477; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 7478; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 7479; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 7480; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7481; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 7482; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 7483; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 7484; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 7485; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 7486; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 7487; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 7488; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 7489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 7490; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7491; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7492; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 7493; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7494; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7495; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7496; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 7497; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 7498; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7499; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7500; SKIP-CACHE-INV-NEXT: s_endpgm 7501; 7502; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7503; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7504; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7505; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7506; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s6, s[8:9], 0xc 7507; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7508; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7509; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7510; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7511; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7512; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7513; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7514; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7515; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7516; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7517; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7518; 7519; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7520; GFX90A-TGSPLIT: ; %bb.0: ; %entry 7521; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7522; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7523; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7524; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7525; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7526; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7527; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7528; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7529; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7530; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7531; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7532; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7533; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7534; GFX90A-TGSPLIT-NEXT: s_endpgm 7535; 7536; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7537; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7538; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7539; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7540; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7541; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) 7542; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7543; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7544; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7545; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7546; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7547; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7548; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7549; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7550; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7551; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7552; 7553; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7554; GFX940-TGSPLIT: ; %bb.0: ; %entry 7555; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7556; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7557; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7558; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7559; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7560; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7561; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7562; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7563; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7564; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7565; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7566; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7567; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7568; GFX940-TGSPLIT-NEXT: s_endpgm 7569; 7570; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7571; GFX11-WGP: ; %bb.0: ; %entry 7572; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7573; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7574; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7575; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7576; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 7577; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 7578; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 7579; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 7580; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7581; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7582; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7583; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7584; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7585; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7586; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7587; GFX11-WGP-NEXT: s_endpgm 7588; 7589; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7590; GFX11-CU: ; %bb.0: ; %entry 7591; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7592; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7593; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7594; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7595; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 7596; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 7597; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7598; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 7599; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7600; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7601; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7602; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7603; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7604; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7605; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7606; GFX11-CU-NEXT: s_endpgm 7607; 7608; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7609; GFX12-WGP: ; %bb.0: ; %entry 7610; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7611; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7612; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7613; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 7614; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 7615; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 7616; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7617; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 7618; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7619; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7620; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
offset:16 th:TH_ATOMIC_RETURN 7621; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7622; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7623; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7624; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 7625; GFX12-WGP-NEXT: s_endpgm 7626; 7627; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: 7628; GFX12-CU: ; %bb.0: ; %entry 7629; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7630; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7631; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7632; GFX12-CU-NEXT: s_wait_kmcnt 0x0 7633; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 7634; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 7635; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7636; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 7637; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7638; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7639; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7640; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7641; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7642; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 7643; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 7644; GFX12-CU-NEXT: s_endpgm 7645 ptr %out, i32 %in, i32 %old) { 7646entry: 7647 %gep = getelementptr i32, ptr %out, i32 4 7648 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire 7649 %val0 = extractvalue { i32, i1 } %val, 0 7650 store i32 %val0, ptr %out, align 4 7651 ret void 7652} 7653 7654define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( 7655; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7656; GFX7: ; %bb.0: ; %entry 7657; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 7658; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7659; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 7660; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 7661; GFX7-NEXT: s_mov_b64 s[12:13], 16 7662; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7663; GFX7-NEXT: s_mov_b32 s6, s4 7664; GFX7-NEXT: s_mov_b32 s7, s5 7665; GFX7-NEXT: s_mov_b32 s11, s12 7666; GFX7-NEXT: 
s_mov_b32 s10, s13 7667; GFX7-NEXT: s_add_u32 s6, s6, s11 7668; GFX7-NEXT: s_addc_u32 s10, s7, s10 7669; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7670; GFX7-NEXT: s_mov_b32 s7, s10 7671; GFX7-NEXT: v_mov_b32_e32 v2, s9 7672; GFX7-NEXT: v_mov_b32_e32 v0, s8 7673; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7674; GFX7-NEXT: v_mov_b32_e32 v3, v0 7675; GFX7-NEXT: v_mov_b32_e32 v0, s6 7676; GFX7-NEXT: v_mov_b32_e32 v1, s7 7677; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7678; GFX7-NEXT: v_mov_b32_e32 v0, s4 7679; GFX7-NEXT: v_mov_b32_e32 v1, s5 7680; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7681; GFX7-NEXT: flat_store_dword v[0:1], v2 7682; GFX7-NEXT: s_endpgm 7683; 7684; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7685; GFX10-WGP: ; %bb.0: ; %entry 7686; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7687; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7688; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7689; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7690; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7691; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7692; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7693; GFX10-WGP-NEXT: s_mov_b32 s7, s5 7694; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7695; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7696; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7697; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7698; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7699; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7700; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 7701; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7702; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7703; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7704; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7705; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7706; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7707; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7708; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7709; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
7710; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7711; GFX10-WGP-NEXT: s_endpgm 7712; 7713; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7714; GFX10-CU: ; %bb.0: ; %entry 7715; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7716; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7717; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7718; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7719; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7720; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7721; GFX10-CU-NEXT: s_mov_b32 s6, s4 7722; GFX10-CU-NEXT: s_mov_b32 s7, s5 7723; GFX10-CU-NEXT: s_mov_b32 s11, s12 7724; GFX10-CU-NEXT: s_mov_b32 s10, s13 7725; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7726; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7727; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7728; GFX10-CU-NEXT: s_mov_b32 s7, s10 7729; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7730; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 7731; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7732; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 7733; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 7734; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 7735; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7736; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 7737; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 7738; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7739; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 7740; GFX10-CU-NEXT: s_endpgm 7741; 7742; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7743; SKIP-CACHE-INV: ; %bb.0: ; %entry 7744; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 7745; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 7746; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 7747; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 7748; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 7749; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 7750; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 7751; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 7752; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 7753; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 7754; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 7755; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 7756; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 7757; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 7758; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 7759; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 7760; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7761; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 7762; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 7763; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 7764; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7765; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 7766; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 7767; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7768; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 7769; SKIP-CACHE-INV-NEXT: s_endpgm 7770; 7771; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7772; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 7773; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7774; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7775; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7776; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7777; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7778; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7779; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7780; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7781; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7782; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7783; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7784; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7785; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 7786; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 7787; 7788; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7789; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry 7790; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 7791; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 7792; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 7793; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7794; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 7795; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 7796; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7797; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7798; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7799; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 7800; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 7801; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7802; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 7803; GFX90A-TGSPLIT-NEXT: s_endpgm 7804; 7805; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7806; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 7807; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 7808; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7809; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7810; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7811; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7812; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7813; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7814; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7815; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7816; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7817; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7818; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7819; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7820; GFX940-NOTTGSPLIT-NEXT: s_endpgm 7821; 7822; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7823; GFX940-TGSPLIT: ; %bb.0: ; %entry 7824; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 7825; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 7826; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 7827; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 7828; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 7829; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 7830; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7831; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 7832; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7833; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 7834; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 7835; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 7836; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 7837; GFX940-TGSPLIT-NEXT: s_endpgm 7838; 7839; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7840; GFX11-WGP: ; %bb.0: ; %entry 7841; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7842; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7843; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7844; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 7845; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 7846; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 7847; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7848; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 7849; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7850; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7851; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7852; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 7853; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 7854; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7855; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 7856; GFX11-WGP-NEXT: s_endpgm 7857; 7858; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7859; GFX11-CU: ; %bb.0: ; %entry 7860; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7861; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7862; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7863; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 7864; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 7865; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 7866; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7867; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 7868; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7869; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7870; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 7871; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 7872; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 7873; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7874; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 7875; GFX11-CU-NEXT: s_endpgm 7876; 7877; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7878; GFX12-WGP: ; %bb.0: ; %entry 7879; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7880; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 7881; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 7882; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 7883; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 7884; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 7885; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7886; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 7887; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7888; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7889; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7890; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 7891; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 7892; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 7893; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 7894; GFX12-WGP-NEXT: s_endpgm 7895; 7896; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: 7897; GFX12-CU: ; %bb.0: ; %entry 7898; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 7899; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 7900; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 7901; GFX12-CU-NEXT: s_wait_kmcnt 0x0 7902; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 7903; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 7904; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7905; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 7906; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7907; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7908; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 7909; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 7910; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 7911; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 7912; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 7913; GFX12-CU-NEXT: s_endpgm 7914 ptr %out, i32 %in, i32 %old) { 7915entry: 7916 %gep = getelementptr i32, ptr %out, i32 4 7917 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire 7918 %val0 = extractvalue { i32, i1 } %val, 0 7919 store i32 %val0, ptr %out, align 4 7920 ret void 7921} 7922 7923define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( 7924; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 7925; GFX7: ; %bb.0: ; %entry 7926; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 7927; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7928; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 7929; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 7930; GFX7-NEXT: s_mov_b64 s[12:13], 16 7931; GFX7-NEXT: s_waitcnt lgkmcnt(0) 7932; GFX7-NEXT: s_mov_b32 s6, s4 7933; GFX7-NEXT: s_mov_b32 s7, s5 7934; GFX7-NEXT: s_mov_b32 s11, s12 7935; GFX7-NEXT: s_mov_b32 s10, s13 7936; GFX7-NEXT: s_add_u32 s6, s6, s11 7937; GFX7-NEXT: s_addc_u32 s10, s7, s10 7938; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7939; GFX7-NEXT: s_mov_b32 s7, s10 7940; GFX7-NEXT: v_mov_b32_e32 v2, s9 7941; GFX7-NEXT: v_mov_b32_e32 v0, s8 7942; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7943; GFX7-NEXT: v_mov_b32_e32 v3, v0 7944; GFX7-NEXT: v_mov_b32_e32 v0, s6 7945; GFX7-NEXT: v_mov_b32_e32 v1, s7 7946; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7947; GFX7-NEXT: v_mov_b32_e32 v0, s4 7948; GFX7-NEXT: v_mov_b32_e32 v1, s5 7949; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7950; GFX7-NEXT: flat_store_dword v[0:1], v2 7951; GFX7-NEXT: s_endpgm 7952; 7953; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 
7954; GFX10-WGP: ; %bb.0: ; %entry 7955; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 7956; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7957; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 7958; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 7959; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 7960; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 7961; GFX10-WGP-NEXT: s_mov_b32 s6, s4 7962; GFX10-WGP-NEXT: s_mov_b32 s7, s5 7963; GFX10-WGP-NEXT: s_mov_b32 s11, s12 7964; GFX10-WGP-NEXT: s_mov_b32 s10, s13 7965; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 7966; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 7967; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7968; GFX10-WGP-NEXT: s_mov_b32 s7, s10 7969; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 7970; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 7971; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 7972; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 7973; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 7974; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 7975; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7976; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 7977; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 7978; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7979; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 7980; GFX10-WGP-NEXT: s_endpgm 7981; 7982; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 7983; GFX10-CU: ; %bb.0: ; %entry 7984; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 7985; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 7986; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 7987; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 7988; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 7989; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 7990; GFX10-CU-NEXT: s_mov_b32 s6, s4 7991; GFX10-CU-NEXT: s_mov_b32 s7, s5 7992; GFX10-CU-NEXT: s_mov_b32 s11, s12 7993; GFX10-CU-NEXT: s_mov_b32 s10, s13 7994; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 7995; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 7996; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 7997; 
GFX10-CU-NEXT: s_mov_b32 s7, s10 7998; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 7999; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 8000; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8001; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8002; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8003; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8004; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8005; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8006; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8007; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8008; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8009; GFX10-CU-NEXT: s_endpgm 8010; 8011; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8012; SKIP-CACHE-INV: ; %bb.0: ; %entry 8013; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8014; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8015; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8016; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8017; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8018; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8019; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8020; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8021; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8022; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8023; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8024; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8025; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8026; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8027; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8028; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8029; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8030; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8031; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8032; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8033; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8034; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8035; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8036; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
8037; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8038; SKIP-CACHE-INV-NEXT: s_endpgm 8039; 8040; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8041; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8042; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8043; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8044; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8045; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8046; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8047; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8048; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8049; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8050; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8051; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8052; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8053; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8054; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8055; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8056; 8057; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8058; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8059; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8060; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8061; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8062; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8063; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8064; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8065; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8066; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8067; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8068; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8069; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8070; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8071; GFX90A-TGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 8072; GFX90A-TGSPLIT-NEXT: s_endpgm 8073; 8074; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8075; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8076; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8077; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8078; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8079; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8080; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8081; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8082; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8083; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8084; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8085; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8086; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8087; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8088; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8089; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8090; 8091; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8092; GFX940-TGSPLIT: ; %bb.0: ; %entry 8093; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8094; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8095; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8096; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8097; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8098; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8099; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8100; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8101; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8102; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8103; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8104; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8105; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8106; GFX940-TGSPLIT-NEXT: s_endpgm 8107; 8108; GFX11-WGP-LABEL: 
flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8109; GFX11-WGP: ; %bb.0: ; %entry 8110; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8111; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8112; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8113; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8114; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8115; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8116; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8117; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8118; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8119; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8120; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8121; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8122; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8123; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8124; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8125; GFX11-WGP-NEXT: s_endpgm 8126; 8127; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8128; GFX11-CU: ; %bb.0: ; %entry 8129; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8130; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8131; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8132; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8133; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8134; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8135; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8136; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8137; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8138; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8139; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8140; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8141; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8142; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8143; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8144; GFX11-CU-NEXT: s_endpgm 8145; 8146; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8147; GFX12-WGP: ; %bb.0: ; %entry 8148; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8149; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8150; GFX12-WGP-NEXT: 
s_load_b32 s2, s[4:5], 0xc 8151; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8152; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8153; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8154; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8155; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8156; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8157; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8158; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8159; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8160; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8161; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8162; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8163; GFX12-WGP-NEXT: s_endpgm 8164; 8165; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: 8166; GFX12-CU: ; %bb.0: ; %entry 8167; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8168; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8169; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8170; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8171; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8172; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8173; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8174; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8175; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8176; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8177; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8178; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8179; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8180; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 8181; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 8182; GFX12-CU-NEXT: s_endpgm 8183 ptr %out, i32 %in, i32 %old) { 8184entry: 8185 %gep = getelementptr i32, ptr %out, i32 4 8186 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire 8187 %val0 = extractvalue { i32, i1 } %val, 0 8188 store i32 %val0, ptr %out, align 4 8189 ret void 8190} 8191 8192define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( 8193; GFX7-LABEL: 
flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8194; GFX7: ; %bb.0: ; %entry 8195; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 8196; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8197; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8198; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8199; GFX7-NEXT: s_mov_b64 s[12:13], 16 8200; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8201; GFX7-NEXT: s_mov_b32 s6, s4 8202; GFX7-NEXT: s_mov_b32 s7, s5 8203; GFX7-NEXT: s_mov_b32 s11, s12 8204; GFX7-NEXT: s_mov_b32 s10, s13 8205; GFX7-NEXT: s_add_u32 s6, s6, s11 8206; GFX7-NEXT: s_addc_u32 s10, s7, s10 8207; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8208; GFX7-NEXT: s_mov_b32 s7, s10 8209; GFX7-NEXT: v_mov_b32_e32 v2, s9 8210; GFX7-NEXT: v_mov_b32_e32 v0, s8 8211; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8212; GFX7-NEXT: v_mov_b32_e32 v3, v0 8213; GFX7-NEXT: v_mov_b32_e32 v0, s6 8214; GFX7-NEXT: v_mov_b32_e32 v1, s7 8215; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8216; GFX7-NEXT: v_mov_b32_e32 v0, s4 8217; GFX7-NEXT: v_mov_b32_e32 v1, s5 8218; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8219; GFX7-NEXT: flat_store_dword v[0:1], v2 8220; GFX7-NEXT: s_endpgm 8221; 8222; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8223; GFX10-WGP: ; %bb.0: ; %entry 8224; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8225; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8226; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8227; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8228; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8229; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8230; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8231; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8232; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8233; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8234; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8235; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8236; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8237; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8238; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 
8239; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8240; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8241; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8242; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8243; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8244; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8245; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8246; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8247; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8248; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8249; GFX10-WGP-NEXT: s_endpgm 8250; 8251; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8252; GFX10-CU: ; %bb.0: ; %entry 8253; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8254; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8255; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 8256; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8257; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 8258; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8259; GFX10-CU-NEXT: s_mov_b32 s6, s4 8260; GFX10-CU-NEXT: s_mov_b32 s7, s5 8261; GFX10-CU-NEXT: s_mov_b32 s11, s12 8262; GFX10-CU-NEXT: s_mov_b32 s10, s13 8263; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 8264; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 8265; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8266; GFX10-CU-NEXT: s_mov_b32 s7, s10 8267; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 8268; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 8269; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8270; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8271; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8272; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8273; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8274; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8275; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8276; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8277; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8278; GFX10-CU-NEXT: s_endpgm 8279; 8280; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8281; SKIP-CACHE-INV: ; %bb.0: ; %entry 8282; 
SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8283; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8284; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8285; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8286; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8287; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8288; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8289; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8290; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8291; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8292; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8293; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8294; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8295; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8296; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8297; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8298; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8299; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8300; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8301; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8302; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8303; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8304; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8305; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8306; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8307; SKIP-CACHE-INV-NEXT: s_endpgm 8308; 8309; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8310; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8311; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8312; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8313; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8314; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8315; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8316; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8317; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8318; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8319; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8320; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8321; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8322; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8323; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8324; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8325; 8326; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8327; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8328; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8329; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8330; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8331; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8332; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8333; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8334; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8335; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8336; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8337; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8338; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8339; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8340; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8341; GFX90A-TGSPLIT-NEXT: s_endpgm 8342; 8343; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8344; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8345; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8346; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8347; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8348; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8349; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8350; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8351; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8352; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8353; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8354; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8355; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8356; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8357; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8358; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8359; 8360; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8361; GFX940-TGSPLIT: ; %bb.0: ; %entry 8362; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8363; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8364; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8365; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8366; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8367; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8368; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8369; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8370; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8371; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8372; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8373; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8374; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8375; GFX940-TGSPLIT-NEXT: s_endpgm 8376; 8377; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8378; GFX11-WGP: ; %bb.0: ; %entry 8379; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8380; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8381; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8382; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8383; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8384; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8385; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8386; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8387; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8388; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8389; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8390; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8391; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8392; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8393; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8394; GFX11-WGP-NEXT: s_endpgm 8395; 8396; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8397; GFX11-CU: ; %bb.0: ; %entry 8398; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8399; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8400; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8401; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8402; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8403; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8404; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8405; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8406; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8407; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8408; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8409; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8410; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8411; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8412; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8413; GFX11-CU-NEXT: s_endpgm 8414; 8415; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8416; GFX12-WGP: ; %bb.0: ; %entry 8417; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8418; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8419; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8420; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8421; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8422; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8423; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8424; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8425; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8426; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8427; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8428; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8429; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8430; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8431; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8432; GFX12-WGP-NEXT: s_endpgm 8433; 8434; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: 8435; 
GFX12-CU: ; %bb.0: ; %entry 8436; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8437; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8438; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8439; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8440; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8441; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8442; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8443; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8444; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8445; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8446; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8447; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8448; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8449; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 8450; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 8451; GFX12-CU-NEXT: s_endpgm 8452 ptr %out, i32 %in, i32 %old) { 8453entry: 8454 %gep = getelementptr i32, ptr %out, i32 4 8455 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire 8456 %val0 = extractvalue { i32, i1 } %val, 0 8457 store i32 %val0, ptr %out, align 4 8458 ret void 8459} 8460 8461define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( 8462; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8463; GFX7: ; %bb.0: ; %entry 8464; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 8465; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8466; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8467; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8468; GFX7-NEXT: s_mov_b64 s[12:13], 16 8469; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8470; GFX7-NEXT: s_mov_b32 s6, s4 8471; GFX7-NEXT: s_mov_b32 s7, s5 8472; GFX7-NEXT: s_mov_b32 s11, s12 8473; GFX7-NEXT: s_mov_b32 s10, s13 8474; GFX7-NEXT: s_add_u32 s6, s6, s11 8475; GFX7-NEXT: s_addc_u32 s10, s7, s10 8476; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8477; GFX7-NEXT: s_mov_b32 s7, s10 8478; GFX7-NEXT: v_mov_b32_e32 v2, s9 8479; GFX7-NEXT: v_mov_b32_e32 v0, s8 8480; GFX7-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec 8481; GFX7-NEXT: v_mov_b32_e32 v3, v0 8482; GFX7-NEXT: v_mov_b32_e32 v0, s6 8483; GFX7-NEXT: v_mov_b32_e32 v1, s7 8484; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8485; GFX7-NEXT: v_mov_b32_e32 v0, s4 8486; GFX7-NEXT: v_mov_b32_e32 v1, s5 8487; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8488; GFX7-NEXT: flat_store_dword v[0:1], v2 8489; GFX7-NEXT: s_endpgm 8490; 8491; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8492; GFX10-WGP: ; %bb.0: ; %entry 8493; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8494; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8495; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8496; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8497; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8498; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8499; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8500; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8501; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8502; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8503; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8504; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8505; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8506; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8507; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 8508; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8509; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8510; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8511; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8512; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8513; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8514; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8515; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8516; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8517; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8518; GFX10-WGP-NEXT: s_endpgm 8519; 8520; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8521; GFX10-CU: ; %bb.0: ; %entry 8522; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8523; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8524; GFX10-CU-NEXT: 
s_load_dword s9, s[6:7], 0x8 8525; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8526; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 8527; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8528; GFX10-CU-NEXT: s_mov_b32 s6, s4 8529; GFX10-CU-NEXT: s_mov_b32 s7, s5 8530; GFX10-CU-NEXT: s_mov_b32 s11, s12 8531; GFX10-CU-NEXT: s_mov_b32 s10, s13 8532; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 8533; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 8534; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8535; GFX10-CU-NEXT: s_mov_b32 s7, s10 8536; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 8537; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 8538; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8539; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8540; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8541; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8542; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8543; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8544; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8545; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8546; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8547; GFX10-CU-NEXT: s_endpgm 8548; 8549; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8550; SKIP-CACHE-INV: ; %bb.0: ; %entry 8551; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8552; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8553; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8554; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8555; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8556; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8557; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8558; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8559; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8560; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8561; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8562; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8563; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8564; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8565; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8566; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8567; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8568; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8569; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8570; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8571; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8572; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8573; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8574; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8575; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8576; SKIP-CACHE-INV-NEXT: s_endpgm 8577; 8578; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8579; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8580; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8581; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8582; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8583; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8584; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8585; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8586; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8587; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8588; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8589; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8590; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8591; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8592; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8593; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8594; 8595; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8596; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8597; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8598; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8599; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8600; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8601; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8602; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8603; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8604; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8605; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8606; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8607; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8608; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8609; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8610; GFX90A-TGSPLIT-NEXT: s_endpgm 8611; 8612; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8613; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8614; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8615; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8616; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8617; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8618; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8619; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8620; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8621; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8622; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8623; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8624; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8625; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8626; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8627; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8628; 8629; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8630; GFX940-TGSPLIT: ; %bb.0: ; %entry 8631; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8632; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8633; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8634; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8635; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8636; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8637; GFX940-TGSPLIT-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8638; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8639; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8640; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8641; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8642; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8643; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8644; GFX940-TGSPLIT-NEXT: s_endpgm 8645; 8646; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8647; GFX11-WGP: ; %bb.0: ; %entry 8648; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8649; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8650; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8651; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8652; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8653; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8654; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8655; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8656; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8657; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8658; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8659; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8660; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8661; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8662; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8663; GFX11-WGP-NEXT: s_endpgm 8664; 8665; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8666; GFX11-CU: ; %bb.0: ; %entry 8667; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8668; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8669; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8670; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8671; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8672; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8673; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8674; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8675; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8676; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8677; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3] offset:16 glc 8678; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8679; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8680; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8681; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8682; GFX11-CU-NEXT: s_endpgm 8683; 8684; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8685; GFX12-WGP: ; %bb.0: ; %entry 8686; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8687; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8688; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8689; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8690; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8691; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8692; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8693; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8694; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8695; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8696; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8697; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8698; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8699; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8700; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8701; GFX12-WGP-NEXT: s_endpgm 8702; 8703; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: 8704; GFX12-CU: ; %bb.0: ; %entry 8705; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8706; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8707; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8708; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8709; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8710; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8711; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8712; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8713; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8714; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8715; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8716; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8717; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8718; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 8719; GFX12-CU-NEXT: flat_store_b32 
v[0:1], v2 8720; GFX12-CU-NEXT: s_endpgm 8721 ptr %out, i32 %in, i32 %old) { 8722entry: 8723 %gep = getelementptr i32, ptr %out, i32 4 8724 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst 8725 %val0 = extractvalue { i32, i1 } %val, 0 8726 store i32 %val0, ptr %out, align 4 8727 ret void 8728} 8729 8730define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( 8731; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8732; GFX7: ; %bb.0: ; %entry 8733; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 8734; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8735; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 8736; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 8737; GFX7-NEXT: s_mov_b64 s[12:13], 16 8738; GFX7-NEXT: s_waitcnt lgkmcnt(0) 8739; GFX7-NEXT: s_mov_b32 s6, s4 8740; GFX7-NEXT: s_mov_b32 s7, s5 8741; GFX7-NEXT: s_mov_b32 s11, s12 8742; GFX7-NEXT: s_mov_b32 s10, s13 8743; GFX7-NEXT: s_add_u32 s6, s6, s11 8744; GFX7-NEXT: s_addc_u32 s10, s7, s10 8745; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8746; GFX7-NEXT: s_mov_b32 s7, s10 8747; GFX7-NEXT: v_mov_b32_e32 v2, s9 8748; GFX7-NEXT: v_mov_b32_e32 v0, s8 8749; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8750; GFX7-NEXT: v_mov_b32_e32 v3, v0 8751; GFX7-NEXT: v_mov_b32_e32 v0, s6 8752; GFX7-NEXT: v_mov_b32_e32 v1, s7 8753; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8754; GFX7-NEXT: v_mov_b32_e32 v0, s4 8755; GFX7-NEXT: v_mov_b32_e32 v1, s5 8756; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8757; GFX7-NEXT: flat_store_dword v[0:1], v2 8758; GFX7-NEXT: s_endpgm 8759; 8760; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8761; GFX10-WGP: ; %bb.0: ; %entry 8762; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 8763; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8764; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 8765; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 8766; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 8767; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 8768; GFX10-WGP-NEXT: s_mov_b32 s6, s4 8769; GFX10-WGP-NEXT: s_mov_b32 s7, s5 8770; GFX10-WGP-NEXT: s_mov_b32 s11, s12 8771; GFX10-WGP-NEXT: s_mov_b32 s10, s13 8772; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 8773; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 8774; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8775; GFX10-WGP-NEXT: s_mov_b32 s7, s10 8776; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 8777; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 8778; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8779; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 8780; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 8781; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 8782; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8783; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 8784; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 8785; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8786; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 8787; GFX10-WGP-NEXT: s_endpgm 8788; 8789; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8790; GFX10-CU: ; %bb.0: ; %entry 8791; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 8792; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 8793; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 8794; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 8795; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 8796; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 8797; GFX10-CU-NEXT: s_mov_b32 s6, s4 8798; GFX10-CU-NEXT: s_mov_b32 s7, s5 8799; GFX10-CU-NEXT: s_mov_b32 s11, s12 8800; GFX10-CU-NEXT: s_mov_b32 s10, s13 8801; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 8802; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 8803; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 8804; GFX10-CU-NEXT: s_mov_b32 s7, s10 8805; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 8806; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 8807; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8808; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 8809; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 8810; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 8811; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8812; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 8813; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 8814; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8815; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 8816; GFX10-CU-NEXT: s_endpgm 8817; 8818; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8819; SKIP-CACHE-INV: ; %bb.0: ; %entry 8820; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 8821; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 8822; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 8823; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 8824; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 8825; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 8826; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 8827; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 8828; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 8829; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 8830; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 8831; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 8832; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 8833; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 8834; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 8835; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 8836; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8837; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 8838; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 8839; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 8840; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8841; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 8842; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 8843; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8844; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 8845; SKIP-CACHE-INV-NEXT: s_endpgm 8846; 8847; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8848; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 8849; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8850; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8851; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8852; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8853; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8854; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8855; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8856; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8857; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8858; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8859; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8860; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8861; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 8862; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 8863; 8864; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8865; GFX90A-TGSPLIT: ; %bb.0: ; %entry 8866; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 8867; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 8868; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 8869; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8870; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 8871; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 8872; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8873; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8874; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8875; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 8876; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 8877; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8878; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 8879; GFX90A-TGSPLIT-NEXT: s_endpgm 8880; 8881; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8882; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 8883; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8884; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 
0x8 8885; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8886; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8887; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8888; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8889; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8890; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8891; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8892; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8893; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8894; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8895; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8896; GFX940-NOTTGSPLIT-NEXT: s_endpgm 8897; 8898; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8899; GFX940-TGSPLIT: ; %bb.0: ; %entry 8900; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 8901; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 8902; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 8903; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 8904; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 8905; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 8906; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8907; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 8908; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8909; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 8910; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 8911; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 8912; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 8913; GFX940-TGSPLIT-NEXT: s_endpgm 8914; 8915; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8916; GFX11-WGP: ; %bb.0: ; %entry 8917; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8918; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8919; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8920; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 8921; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 8922; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 8923; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8924; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 8925; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8926; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8927; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8928; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 8929; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 8930; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8931; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 8932; GFX11-WGP-NEXT: s_endpgm 8933; 8934; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8935; GFX11-CU: ; %bb.0: ; %entry 8936; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8937; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8938; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8939; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 8940; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 8941; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 8942; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8943; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 8944; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8945; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8946; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 8947; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 8948; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 8949; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8950; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 8951; GFX11-CU-NEXT: s_endpgm 8952; 8953; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8954; GFX12-WGP: ; %bb.0: ; %entry 8955; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8956; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 8957; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 8958; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 8959; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 8960; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 8961; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8962; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 8963; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8964; 
GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8965; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8966; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 8967; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 8968; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 8969; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 8970; GFX12-WGP-NEXT: s_endpgm 8971; 8972; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: 8973; GFX12-CU: ; %bb.0: ; %entry 8974; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 8975; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 8976; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 8977; GFX12-CU-NEXT: s_wait_kmcnt 0x0 8978; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 8979; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 8980; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 8981; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 8982; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8983; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8984; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 8985; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 8986; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 8987; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 8988; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 8989; GFX12-CU-NEXT: s_endpgm 8990 ptr %out, i32 %in, i32 %old) { 8991entry: 8992 %gep = getelementptr i32, ptr %out, i32 4 8993 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst 8994 %val0 = extractvalue { i32, i1 } %val, 0 8995 store i32 %val0, ptr %out, align 4 8996 ret void 8997} 8998 8999define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( 9000; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9001; GFX7: ; %bb.0: ; %entry 9002; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 9003; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9004; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 9005; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 9006; GFX7-NEXT: s_mov_b64 s[12:13], 16 9007; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9008; GFX7-NEXT: 
s_mov_b32 s6, s4 9009; GFX7-NEXT: s_mov_b32 s7, s5 9010; GFX7-NEXT: s_mov_b32 s11, s12 9011; GFX7-NEXT: s_mov_b32 s10, s13 9012; GFX7-NEXT: s_add_u32 s6, s6, s11 9013; GFX7-NEXT: s_addc_u32 s10, s7, s10 9014; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9015; GFX7-NEXT: s_mov_b32 s7, s10 9016; GFX7-NEXT: v_mov_b32_e32 v2, s9 9017; GFX7-NEXT: v_mov_b32_e32 v0, s8 9018; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9019; GFX7-NEXT: v_mov_b32_e32 v3, v0 9020; GFX7-NEXT: v_mov_b32_e32 v0, s6 9021; GFX7-NEXT: v_mov_b32_e32 v1, s7 9022; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9023; GFX7-NEXT: v_mov_b32_e32 v0, s4 9024; GFX7-NEXT: v_mov_b32_e32 v1, s5 9025; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9026; GFX7-NEXT: flat_store_dword v[0:1], v2 9027; GFX7-NEXT: s_endpgm 9028; 9029; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9030; GFX10-WGP: ; %bb.0: ; %entry 9031; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 9032; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9033; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 9034; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 9035; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 9036; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9037; GFX10-WGP-NEXT: s_mov_b32 s6, s4 9038; GFX10-WGP-NEXT: s_mov_b32 s7, s5 9039; GFX10-WGP-NEXT: s_mov_b32 s11, s12 9040; GFX10-WGP-NEXT: s_mov_b32 s10, s13 9041; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 9042; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 9043; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9044; GFX10-WGP-NEXT: s_mov_b32 s7, s10 9045; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 9046; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 9047; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9048; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 9049; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 9050; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 9051; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9052; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s4 9053; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9054; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9055; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9056; GFX10-WGP-NEXT: s_endpgm 9057; 9058; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9059; GFX10-CU: ; %bb.0: ; %entry 9060; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 9061; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9062; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 9063; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 9064; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 9065; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9066; GFX10-CU-NEXT: s_mov_b32 s6, s4 9067; GFX10-CU-NEXT: s_mov_b32 s7, s5 9068; GFX10-CU-NEXT: s_mov_b32 s11, s12 9069; GFX10-CU-NEXT: s_mov_b32 s10, s13 9070; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 9071; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 9072; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9073; GFX10-CU-NEXT: s_mov_b32 s7, s10 9074; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 9075; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 9076; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9077; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 9078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 9079; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 9080; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9081; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9082; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9083; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9084; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9085; GFX10-CU-NEXT: s_endpgm 9086; 9087; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9088; SKIP-CACHE-INV: ; %bb.0: ; %entry 9089; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 9090; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 9091; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 9092; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 9093; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 9094; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9095; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 9096; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 9097; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 9098; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 9099; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 9100; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 9101; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 9102; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 9103; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 9104; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9105; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9106; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 9107; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9108; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9109; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9110; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 9111; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 9112; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9113; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9114; SKIP-CACHE-INV-NEXT: s_endpgm 9115; 9116; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9117; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9118; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9119; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9120; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9121; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9122; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9123; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9124; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9125; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9126; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9127; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9128; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9129; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9130; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9131; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9132; 
9133; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9134; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9135; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9136; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9137; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9138; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9139; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9140; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9141; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9142; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9143; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9144; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9145; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9146; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9147; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9148; GFX90A-TGSPLIT-NEXT: s_endpgm 9149; 9150; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9151; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9152; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9153; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9154; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9155; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9156; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9157; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9158; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9159; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9160; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9161; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9162; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9163; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9164; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9165; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9166; 9167; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9168; 
GFX940-TGSPLIT: ; %bb.0: ; %entry 9169; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9170; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9171; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9172; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9173; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9174; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9175; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9176; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9177; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9178; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9179; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9180; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9181; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9182; GFX940-TGSPLIT-NEXT: s_endpgm 9183; 9184; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9185; GFX11-WGP: ; %bb.0: ; %entry 9186; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9187; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9188; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9189; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9190; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 9191; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 9192; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9193; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 9194; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9195; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9196; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9197; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9198; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9199; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9200; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 9201; GFX11-WGP-NEXT: s_endpgm 9202; 9203; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9204; GFX11-CU: ; %bb.0: ; %entry 9205; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9206; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9207; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9208; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9209; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 9210; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 9211; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9212; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 9213; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9214; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9215; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9216; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9217; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9218; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9219; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 9220; GFX11-CU-NEXT: s_endpgm 9221; 9222; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9223; GFX12-WGP: ; %bb.0: ; %entry 9224; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9225; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9226; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9227; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 9228; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 9229; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 9230; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9231; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 9232; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9233; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9234; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9235; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9236; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9237; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9238; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 9239; GFX12-WGP-NEXT: s_endpgm 9240; 9241; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: 9242; GFX12-CU: ; %bb.0: ; %entry 9243; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9244; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9245; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9246; GFX12-CU-NEXT: s_wait_kmcnt 0x0 9247; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 9248; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 9249; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9250; 
GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 9251; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9252; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9253; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9254; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9255; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9256; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 9257; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 9258; GFX12-CU-NEXT: s_endpgm 9259 ptr %out, i32 %in, i32 %old) { 9260entry: 9261 %gep = getelementptr i32, ptr %out, i32 4 9262 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst 9263 %val0 = extractvalue { i32, i1 } %val, 0 9264 store i32 %val0, ptr %out, align 4 9265 ret void 9266} 9267 9268define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( 9269; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9270; GFX7: ; %bb.0: ; %entry 9271; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 9272; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9273; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 9274; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 9275; GFX7-NEXT: s_mov_b64 s[12:13], 16 9276; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9277; GFX7-NEXT: s_mov_b32 s6, s4 9278; GFX7-NEXT: s_mov_b32 s7, s5 9279; GFX7-NEXT: s_mov_b32 s11, s12 9280; GFX7-NEXT: s_mov_b32 s10, s13 9281; GFX7-NEXT: s_add_u32 s6, s6, s11 9282; GFX7-NEXT: s_addc_u32 s10, s7, s10 9283; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9284; GFX7-NEXT: s_mov_b32 s7, s10 9285; GFX7-NEXT: v_mov_b32_e32 v2, s9 9286; GFX7-NEXT: v_mov_b32_e32 v0, s8 9287; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9288; GFX7-NEXT: v_mov_b32_e32 v3, v0 9289; GFX7-NEXT: v_mov_b32_e32 v0, s6 9290; GFX7-NEXT: v_mov_b32_e32 v1, s7 9291; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9292; GFX7-NEXT: v_mov_b32_e32 v0, s4 9293; GFX7-NEXT: v_mov_b32_e32 v1, s5 9294; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9295; GFX7-NEXT: flat_store_dword v[0:1], v2 9296; GFX7-NEXT: 
s_endpgm 9297; 9298; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9299; GFX10-WGP: ; %bb.0: ; %entry 9300; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 9301; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9302; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 9303; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 9304; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 9305; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9306; GFX10-WGP-NEXT: s_mov_b32 s6, s4 9307; GFX10-WGP-NEXT: s_mov_b32 s7, s5 9308; GFX10-WGP-NEXT: s_mov_b32 s11, s12 9309; GFX10-WGP-NEXT: s_mov_b32 s10, s13 9310; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 9311; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 9312; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9313; GFX10-WGP-NEXT: s_mov_b32 s7, s10 9314; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 9315; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 9316; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9317; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 9318; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 9319; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 9320; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9321; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9322; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9323; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9324; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9325; GFX10-WGP-NEXT: s_endpgm 9326; 9327; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9328; GFX10-CU: ; %bb.0: ; %entry 9329; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 9330; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9331; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 9332; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 9333; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 9334; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9335; GFX10-CU-NEXT: s_mov_b32 s6, s4 9336; GFX10-CU-NEXT: s_mov_b32 s7, s5 9337; GFX10-CU-NEXT: s_mov_b32 s11, s12 9338; GFX10-CU-NEXT: s_mov_b32 s10, s13 9339; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 9340; GFX10-CU-NEXT: s_addc_u32 s10, s7, 
s10 9341; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9342; GFX10-CU-NEXT: s_mov_b32 s7, s10 9343; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 9344; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 9345; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9346; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 9347; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 9348; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 9349; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9350; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9351; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9352; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9353; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9354; GFX10-CU-NEXT: s_endpgm 9355; 9356; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9357; SKIP-CACHE-INV: ; %bb.0: ; %entry 9358; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 9359; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 9360; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 9361; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 9362; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 9363; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9364; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 9365; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 9366; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 9367; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 9368; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 9369; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 9370; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 9371; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 9372; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 9373; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9374; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9375; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 9376; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9377; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9378; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9379; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 9380; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 9381; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9382; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9383; SKIP-CACHE-INV-NEXT: s_endpgm 9384; 9385; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9386; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9387; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9388; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9389; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9390; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9391; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9392; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9393; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9394; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9395; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9396; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9397; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9398; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9399; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9400; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9401; 9402; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9403; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9404; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9405; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9406; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9407; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9408; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9409; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9410; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9411; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9412; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9413; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9414; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9415; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9416; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9417; GFX90A-TGSPLIT-NEXT: s_endpgm 9418; 9419; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9420; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9421; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9422; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9423; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9424; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9425; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9426; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9427; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9428; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9429; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9430; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9431; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9432; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9433; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9434; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9435; 9436; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9437; GFX940-TGSPLIT: ; %bb.0: ; %entry 9438; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9439; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9440; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9441; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9442; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9443; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9444; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9445; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9446; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9447; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9448; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9449; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9450; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9451; 
GFX940-TGSPLIT-NEXT: s_endpgm 9452; 9453; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9454; GFX11-WGP: ; %bb.0: ; %entry 9455; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9456; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9457; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9458; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9459; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 9460; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 9461; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9462; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 9463; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9464; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9465; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9466; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9467; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9468; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9469; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 9470; GFX11-WGP-NEXT: s_endpgm 9471; 9472; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9473; GFX11-CU: ; %bb.0: ; %entry 9474; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9475; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9476; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9477; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9478; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 9479; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 9480; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9481; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 9482; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9483; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9484; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9485; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9486; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9487; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9488; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 9489; GFX11-CU-NEXT: s_endpgm 9490; 9491; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9492; GFX12-WGP: ; %bb.0: ; %entry 9493; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9494; GFX12-WGP-NEXT: 
s_load_b32 s3, s[4:5], 0x8 9495; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9496; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 9497; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 9498; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 9499; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9500; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 9501; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9502; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9503; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9504; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9505; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9506; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9507; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 9508; GFX12-WGP-NEXT: s_endpgm 9509; 9510; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: 9511; GFX12-CU: ; %bb.0: ; %entry 9512; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9513; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9514; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9515; GFX12-CU-NEXT: s_wait_kmcnt 0x0 9516; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 9517; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 9518; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9519; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 9520; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9521; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9522; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9523; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9524; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9525; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 9526; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 9527; GFX12-CU-NEXT: s_endpgm 9528 ptr %out, i32 %in, i32 %old) { 9529entry: 9530 %gep = getelementptr i32, ptr %out, i32 4 9531 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst 9532 %val0 = extractvalue { i32, i1 } %val, 0 9533 store i32 %val0, ptr %out, align 4 9534 ret void 9535} 9536 9537define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( 9538; 
GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9539; GFX7: ; %bb.0: ; %entry 9540; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 9541; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9542; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 9543; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 9544; GFX7-NEXT: s_mov_b64 s[12:13], 16 9545; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9546; GFX7-NEXT: s_mov_b32 s6, s4 9547; GFX7-NEXT: s_mov_b32 s7, s5 9548; GFX7-NEXT: s_mov_b32 s11, s12 9549; GFX7-NEXT: s_mov_b32 s10, s13 9550; GFX7-NEXT: s_add_u32 s6, s6, s11 9551; GFX7-NEXT: s_addc_u32 s10, s7, s10 9552; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9553; GFX7-NEXT: s_mov_b32 s7, s10 9554; GFX7-NEXT: v_mov_b32_e32 v2, s9 9555; GFX7-NEXT: v_mov_b32_e32 v0, s8 9556; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9557; GFX7-NEXT: v_mov_b32_e32 v3, v0 9558; GFX7-NEXT: v_mov_b32_e32 v0, s6 9559; GFX7-NEXT: v_mov_b32_e32 v1, s7 9560; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9561; GFX7-NEXT: v_mov_b32_e32 v0, s4 9562; GFX7-NEXT: v_mov_b32_e32 v1, s5 9563; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9564; GFX7-NEXT: flat_store_dword v[0:1], v2 9565; GFX7-NEXT: s_endpgm 9566; 9567; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9568; GFX10-WGP: ; %bb.0: ; %entry 9569; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 9570; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9571; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 9572; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 9573; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 9574; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9575; GFX10-WGP-NEXT: s_mov_b32 s6, s4 9576; GFX10-WGP-NEXT: s_mov_b32 s7, s5 9577; GFX10-WGP-NEXT: s_mov_b32 s11, s12 9578; GFX10-WGP-NEXT: s_mov_b32 s10, s13 9579; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 9580; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 9581; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9582; GFX10-WGP-NEXT: s_mov_b32 s7, s10 9583; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s9 9584; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 9585; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9586; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 9587; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 9588; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 9589; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9590; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9591; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9592; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9593; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9594; GFX10-WGP-NEXT: s_endpgm 9595; 9596; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9597; GFX10-CU: ; %bb.0: ; %entry 9598; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 9599; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 9600; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 9601; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 9602; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 9603; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9604; GFX10-CU-NEXT: s_mov_b32 s6, s4 9605; GFX10-CU-NEXT: s_mov_b32 s7, s5 9606; GFX10-CU-NEXT: s_mov_b32 s11, s12 9607; GFX10-CU-NEXT: s_mov_b32 s10, s13 9608; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 9609; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 9610; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 9611; GFX10-CU-NEXT: s_mov_b32 s7, s10 9612; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 9613; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 9614; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9615; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 9616; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 9617; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 9618; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9619; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9620; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9621; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9622; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9623; GFX10-CU-NEXT: s_endpgm 9624; 9625; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9626; SKIP-CACHE-INV: ; %bb.0: ; 
%entry 9627; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 9628; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 9629; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 9630; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 9631; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 9632; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9633; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 9634; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 9635; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 9636; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 9637; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 9638; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 9639; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 9640; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 9641; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 9642; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 9643; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9644; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 9645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9647; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 9648; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 9649; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 9650; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9651; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9652; SKIP-CACHE-INV-NEXT: s_endpgm 9653; 9654; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9655; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9656; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9657; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9658; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9659; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9660; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9661; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9662; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9663; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9664; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9665; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9666; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9667; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9668; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9669; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9670; 9671; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9672; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9673; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 9674; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 9675; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 9676; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9677; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 9678; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 9679; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9680; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9681; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9682; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 9683; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9684; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9685; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9686; GFX90A-TGSPLIT-NEXT: s_endpgm 9687; 9688; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9689; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9690; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9691; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9692; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9693; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9694; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9695; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9696; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9697; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9698; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[0:1] 9699; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9700; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9701; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9702; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9703; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9704; 9705; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9706; GFX940-TGSPLIT: ; %bb.0: ; %entry 9707; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 9708; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 9709; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 9710; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9711; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 9712; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 9713; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9714; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 9715; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9716; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 9717; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9718; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9719; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9720; GFX940-TGSPLIT-NEXT: s_endpgm 9721; 9722; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9723; GFX11-WGP: ; %bb.0: ; %entry 9724; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9725; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9726; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9727; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9728; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 9729; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 9730; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9731; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 9732; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9733; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9734; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9735; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9736; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 
9737; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9738; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 9739; GFX11-WGP-NEXT: s_endpgm 9740; 9741; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9742; GFX11-CU: ; %bb.0: ; %entry 9743; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9744; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9745; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9746; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9747; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 9748; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 9749; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9750; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 9751; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9752; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9753; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 9754; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9755; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9756; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9757; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 9758; GFX11-CU-NEXT: s_endpgm 9759; 9760; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 9761; GFX12-WGP: ; %bb.0: ; %entry 9762; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9763; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 9764; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 9765; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 9766; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 9767; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 9768; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9769; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 9770; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9771; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9772; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9773; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9774; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9775; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9776; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 9777; GFX12-WGP-NEXT: s_endpgm 9778; 9779; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: 
9780; GFX12-CU: ; %bb.0: ; %entry 9781; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 9782; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 9783; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 9784; GFX12-CU-NEXT: s_wait_kmcnt 0x0 9785; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 9786; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 9787; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 9788; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 9789; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9790; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9791; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 9792; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9793; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9794; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 9795; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 9796; GFX12-CU-NEXT: s_endpgm 9797 ptr %out, i32 %in, i32 %old) { 9798entry: 9799 %gep = getelementptr i32, ptr %out, i32 4 9800 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst 9801 %val0 = extractvalue { i32, i1 } %val, 0 9802 store i32 %val0, ptr %out, align 4 9803 ret void 9804} 9805 9806define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( 9807; GFX7-LABEL: flat_singlethread_one_as_unordered_load: 9808; GFX7: ; %bb.0: ; %entry 9809; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9810; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 9811; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9812; GFX7-NEXT: v_mov_b32_e32 v0, s6 9813; GFX7-NEXT: v_mov_b32_e32 v1, s7 9814; GFX7-NEXT: flat_load_dword v2, v[0:1] 9815; GFX7-NEXT: v_mov_b32_e32 v0, s4 9816; GFX7-NEXT: v_mov_b32_e32 v1, s5 9817; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9818; GFX7-NEXT: flat_store_dword v[0:1], v2 9819; GFX7-NEXT: s_endpgm 9820; 9821; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: 9822; GFX10-WGP: ; %bb.0: ; %entry 9823; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9824; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 9825; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
9826; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 9827; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 9828; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 9829; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9830; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9831; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9832; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 9833; GFX10-WGP-NEXT: s_endpgm 9834; 9835; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: 9836; GFX10-CU: ; %bb.0: ; %entry 9837; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9838; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 9839; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 9840; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 9841; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 9842; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 9843; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 9844; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 9845; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9846; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 9847; GFX10-CU-NEXT: s_endpgm 9848; 9849; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: 9850; SKIP-CACHE-INV: ; %bb.0: ; %entry 9851; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 9852; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 9853; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 9854; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 9855; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 9856; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 9857; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 9858; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 9859; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9860; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 9861; SKIP-CACHE-INV-NEXT: s_endpgm 9862; 9863; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 9864; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 9865; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9866; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 9867; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9868; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], 
s[6:7] op_sel:[0,1] 9869; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 9870; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9871; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9872; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 9873; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 9874; 9875; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 9876; GFX90A-TGSPLIT: ; %bb.0: ; %entry 9877; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9878; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 9879; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9880; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 9881; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 9882; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 9883; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9884; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 9885; GFX90A-TGSPLIT-NEXT: s_endpgm 9886; 9887; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 9888; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 9889; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 9890; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 9891; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9892; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9893; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 9894; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9895; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9896; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9897; GFX940-NOTTGSPLIT-NEXT: s_endpgm 9898; 9899; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: 9900; GFX940-TGSPLIT: ; %bb.0: ; %entry 9901; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 9902; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 9903; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 9904; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 9905; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 9906; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 9907; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 9908; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 9909; GFX940-TGSPLIT-NEXT: s_endpgm 9910; 9911; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load: 9912; GFX11-WGP: ; %bb.0: ; %entry 9913; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 9914; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 9915; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 9916; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 9917; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 9918; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 9919; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 9920; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 9921; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9922; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 9923; GFX11-WGP-NEXT: s_endpgm 9924; 9925; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load: 9926; GFX11-CU: ; %bb.0: ; %entry 9927; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 9928; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 9929; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 9930; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 9931; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 9932; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 9933; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 9934; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 9935; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9936; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 9937; GFX11-CU-NEXT: s_endpgm 9938; 9939; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load: 9940; GFX12-WGP: ; %bb.0: ; %entry 9941; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 9942; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 9943; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 9944; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 9945; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 9946; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 9947; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 9948; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 9949; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 9950; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 9951; GFX12-WGP-NEXT: s_endpgm 9952; 9953; 
GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load: 9954; GFX12-CU: ; %bb.0: ; %entry 9955; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 9956; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 9957; GFX12-CU-NEXT: s_wait_kmcnt 0x0 9958; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 9959; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 9960; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 9961; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 9962; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 9963; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 9964; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 9965; GFX12-CU-NEXT: s_endpgm 9966 ptr %in, ptr %out) { 9967entry: 9968 %val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4 9969 store i32 %val, ptr %out 9970 ret void 9971} 9972 9973define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( 9974; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: 9975; GFX7: ; %bb.0: ; %entry 9976; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9977; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 9978; GFX7-NEXT: s_waitcnt lgkmcnt(0) 9979; GFX7-NEXT: v_mov_b32_e32 v0, s6 9980; GFX7-NEXT: v_mov_b32_e32 v1, s7 9981; GFX7-NEXT: flat_load_dword v2, v[0:1] 9982; GFX7-NEXT: v_mov_b32_e32 v0, s4 9983; GFX7-NEXT: v_mov_b32_e32 v1, s5 9984; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9985; GFX7-NEXT: flat_store_dword v[0:1], v2 9986; GFX7-NEXT: s_endpgm 9987; 9988; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: 9989; GFX10-WGP: ; %bb.0: ; %entry 9990; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 9991; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 9992; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 9993; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 9994; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 9995; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 9996; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 9997; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 9998; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9999; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10000; GFX10-WGP-NEXT: s_endpgm 10001; 10002; 
GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: 10003; GFX10-CU: ; %bb.0: ; %entry 10004; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10005; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10006; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10007; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10008; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10009; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 10010; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10011; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10012; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10013; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10014; GFX10-CU-NEXT: s_endpgm 10015; 10016; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: 10017; SKIP-CACHE-INV: ; %bb.0: ; %entry 10018; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10019; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 10020; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10021; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10022; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10023; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 10024; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 10025; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 10026; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10027; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10028; SKIP-CACHE-INV-NEXT: s_endpgm 10029; 10030; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 10031; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10032; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10033; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10034; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10035; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10036; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10037; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10038; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10039; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10040; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10041; 10042; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 10043; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10044; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10045; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10046; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10047; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10048; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10049; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10050; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10051; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10052; GFX90A-TGSPLIT-NEXT: s_endpgm 10053; 10054; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 10055; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10056; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10057; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10058; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10059; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10060; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10061; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10062; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10063; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10064; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10065; 10066; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: 10067; GFX940-TGSPLIT: ; %bb.0: ; %entry 10068; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10069; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10070; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10071; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10072; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10073; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10074; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10075; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10076; GFX940-TGSPLIT-NEXT: s_endpgm 10077; 10078; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load: 10079; GFX11-WGP: ; %bb.0: ; %entry 
10080; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10081; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10082; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10083; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10084; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10085; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 10086; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 10087; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 10088; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10089; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10090; GFX11-WGP-NEXT: s_endpgm 10091; 10092; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load: 10093; GFX11-CU: ; %bb.0: ; %entry 10094; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10095; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10096; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10097; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10098; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10099; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 10100; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 10101; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 10102; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10103; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10104; GFX11-CU-NEXT: s_endpgm 10105; 10106; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load: 10107; GFX12-WGP: ; %bb.0: ; %entry 10108; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10109; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10110; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10111; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10112; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10113; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 10114; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 10115; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 10116; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 10117; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10118; GFX12-WGP-NEXT: s_endpgm 10119; 10120; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load: 10121; GFX12-CU: ; %bb.0: ; %entry 10122; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10123; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10124; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10125; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s2 10126; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 10127; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 10128; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 10129; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 10130; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 10131; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 10132; GFX12-CU-NEXT: s_endpgm 10133 ptr %in, ptr %out) { 10134entry: 10135 %val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4 10136 store i32 %val, ptr %out 10137 ret void 10138} 10139 10140define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( 10141; GFX7-LABEL: flat_singlethread_one_as_acquire_load: 10142; GFX7: ; %bb.0: ; %entry 10143; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10144; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 10145; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10146; GFX7-NEXT: v_mov_b32_e32 v0, s6 10147; GFX7-NEXT: v_mov_b32_e32 v1, s7 10148; GFX7-NEXT: flat_load_dword v2, v[0:1] 10149; GFX7-NEXT: v_mov_b32_e32 v0, s4 10150; GFX7-NEXT: v_mov_b32_e32 v1, s5 10151; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10152; GFX7-NEXT: flat_store_dword v[0:1], v2 10153; GFX7-NEXT: s_endpgm 10154; 10155; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: 10156; GFX10-WGP: ; %bb.0: ; %entry 10157; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10158; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10159; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10160; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10161; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10162; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 10163; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10164; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10165; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10166; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10167; GFX10-WGP-NEXT: s_endpgm 10168; 10169; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: 10170; GFX10-CU: ; %bb.0: ; %entry 10171; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10172; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10173; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10174; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10175; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10176; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 10177; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10178; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10179; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10180; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10181; GFX10-CU-NEXT: s_endpgm 10182; 10183; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: 10184; SKIP-CACHE-INV: ; %bb.0: ; %entry 10185; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10186; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 10187; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10188; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10189; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10190; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 10191; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 10192; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 10193; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10194; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10195; SKIP-CACHE-INV-NEXT: s_endpgm 10196; 10197; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 10198; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10199; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10200; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10201; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10202; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10203; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10204; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10205; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10206; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10207; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10208; 10209; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 10210; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10211; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10212; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x8 10213; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10214; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10215; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10216; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10217; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10218; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10219; GFX90A-TGSPLIT-NEXT: s_endpgm 10220; 10221; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 10222; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10223; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10224; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10225; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10226; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10227; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10228; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10229; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10230; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10231; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10232; 10233; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: 10234; GFX940-TGSPLIT: ; %bb.0: ; %entry 10235; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10236; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10237; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10238; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10239; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10240; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10241; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10242; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10243; GFX940-TGSPLIT-NEXT: s_endpgm 10244; 10245; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load: 10246; GFX11-WGP: ; %bb.0: ; %entry 10247; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10248; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10249; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10250; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10251; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s3 10252; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 10253; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 10254; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 10255; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10256; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10257; GFX11-WGP-NEXT: s_endpgm 10258; 10259; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load: 10260; GFX11-CU: ; %bb.0: ; %entry 10261; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10262; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10263; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10264; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10265; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10266; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 10267; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 10268; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 10269; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10270; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10271; GFX11-CU-NEXT: s_endpgm 10272; 10273; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load: 10274; GFX12-WGP: ; %bb.0: ; %entry 10275; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10276; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10277; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10278; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10279; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10280; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 10281; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 10282; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 10283; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 10284; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10285; GFX12-WGP-NEXT: s_endpgm 10286; 10287; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load: 10288; GFX12-CU: ; %bb.0: ; %entry 10289; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10290; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10291; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10292; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 10293; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 10294; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 10295; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 10296; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 10297; GFX12-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 10298; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 10299; GFX12-CU-NEXT: s_endpgm 10300 ptr %in, ptr %out) { 10301entry: 10302 %val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4 10303 store i32 %val, ptr %out 10304 ret void 10305} 10306 10307define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( 10308; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: 10309; GFX7: ; %bb.0: ; %entry 10310; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10311; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 10312; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10313; GFX7-NEXT: v_mov_b32_e32 v0, s6 10314; GFX7-NEXT: v_mov_b32_e32 v1, s7 10315; GFX7-NEXT: flat_load_dword v2, v[0:1] 10316; GFX7-NEXT: v_mov_b32_e32 v0, s4 10317; GFX7-NEXT: v_mov_b32_e32 v1, s5 10318; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10319; GFX7-NEXT: flat_store_dword v[0:1], v2 10320; GFX7-NEXT: s_endpgm 10321; 10322; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: 10323; GFX10-WGP: ; %bb.0: ; %entry 10324; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10325; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10326; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10327; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10328; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10329; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 10330; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 10331; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 10332; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10333; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10334; GFX10-WGP-NEXT: s_endpgm 10335; 10336; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: 10337; GFX10-CU: ; %bb.0: ; %entry 10338; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10339; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10340; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10341; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10342; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10343; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 10344; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 10345; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 10346; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10347; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10348; GFX10-CU-NEXT: s_endpgm 10349; 10350; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: 10351; SKIP-CACHE-INV: ; %bb.0: ; %entry 10352; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10353; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 10354; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10355; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10356; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10357; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] 10358; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 10359; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 10360; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10361; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10362; SKIP-CACHE-INV-NEXT: s_endpgm 10363; 10364; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 10365; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10366; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10367; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10368; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10369; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10370; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10371; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10372; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10373; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10374; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10375; 10376; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 10377; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10378; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 10379; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 10380; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10381; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10382; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10383; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 10384; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10385; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10386; GFX90A-TGSPLIT-NEXT: s_endpgm 10387; 10388; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 10389; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10390; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10391; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10392; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10393; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10394; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10395; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10396; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10397; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10398; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10399; 10400; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: 10401; GFX940-TGSPLIT: ; %bb.0: ; %entry 10402; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 10403; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 10404; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10405; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10406; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] 10407; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 10408; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 10409; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10410; GFX940-TGSPLIT-NEXT: s_endpgm 10411; 10412; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: 10413; GFX11-WGP: ; %bb.0: ; %entry 10414; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10415; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10416; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10417; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10418; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10419; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] 10420; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 10421; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 10422; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10423; 
GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10424; GFX11-WGP-NEXT: s_endpgm 10425; 10426; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load: 10427; GFX11-CU: ; %bb.0: ; %entry 10428; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10429; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10430; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10431; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10432; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10433; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 10434; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 10435; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 10436; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10437; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10438; GFX11-CU-NEXT: s_endpgm 10439; 10440; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: 10441; GFX12-WGP: ; %bb.0: ; %entry 10442; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10443; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10444; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10445; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10446; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10447; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] 10448; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 10449; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 10450; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 10451; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10452; GFX12-WGP-NEXT: s_endpgm 10453; 10454; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load: 10455; GFX12-CU: ; %bb.0: ; %entry 10456; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 10457; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 10458; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10459; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 10460; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 10461; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] 10462; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 10463; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 10464; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 10465; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 10466; GFX12-CU-NEXT: s_endpgm 10467 ptr %in, ptr %out) { 10468entry: 10469 %val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, 
align 4 10470 store i32 %val, ptr %out 10471 ret void 10472} 10473 10474define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( 10475; GFX7-LABEL: flat_singlethread_one_as_unordered_store: 10476; GFX7: ; %bb.0: ; %entry 10477; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 10478; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 10479; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10480; GFX7-NEXT: v_mov_b32_e32 v0, s6 10481; GFX7-NEXT: v_mov_b32_e32 v1, s7 10482; GFX7-NEXT: v_mov_b32_e32 v2, s4 10483; GFX7-NEXT: flat_store_dword v[0:1], v2 10484; GFX7-NEXT: s_endpgm 10485; 10486; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: 10487; GFX10-WGP: ; %bb.0: ; %entry 10488; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 10489; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10490; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10491; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10492; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10493; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 10494; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10495; GFX10-WGP-NEXT: s_endpgm 10496; 10497; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: 10498; GFX10-CU: ; %bb.0: ; %entry 10499; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 10500; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10501; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10502; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10503; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10504; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 10505; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10506; GFX10-CU-NEXT: s_endpgm 10507; 10508; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: 10509; SKIP-CACHE-INV: ; %bb.0: ; %entry 10510; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 10511; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10512; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10513; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10514; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10515; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10516; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 
10517; SKIP-CACHE-INV-NEXT: s_endpgm 10518; 10519; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 10520; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10521; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10522; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10523; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10524; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10525; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10526; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10527; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10528; 10529; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 10530; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10531; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10532; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10533; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10534; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10535; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10536; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10537; GFX90A-TGSPLIT-NEXT: s_endpgm 10538; 10539; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 10540; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10541; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10542; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10543; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10544; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10545; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10546; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10547; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10548; 10549; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: 10550; GFX940-TGSPLIT: ; %bb.0: ; %entry 10551; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10552; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10553; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10554; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10555; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 
10556; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10557; GFX940-TGSPLIT-NEXT: s_endpgm 10558; 10559; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store: 10560; GFX11-WGP: ; %bb.0: ; %entry 10561; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10562; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10563; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10564; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10565; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10566; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 10567; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10568; GFX11-WGP-NEXT: s_endpgm 10569; 10570; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store: 10571; GFX11-CU: ; %bb.0: ; %entry 10572; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10573; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10574; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10575; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10576; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10577; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 10578; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10579; GFX11-CU-NEXT: s_endpgm 10580; 10581; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_store: 10582; GFX12-WGP: ; %bb.0: ; %entry 10583; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10584; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10585; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10586; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10587; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10588; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 10589; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10590; GFX12-WGP-NEXT: s_endpgm 10591; 10592; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_store: 10593; GFX12-CU: ; %bb.0: ; %entry 10594; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10595; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10596; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10597; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 10598; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 10599; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 10600; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 10601; GFX12-CU-NEXT: s_endpgm 10602 i32 %in, ptr %out) { 10603entry: 10604 
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4 10605 ret void 10606} 10607 10608define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( 10609; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: 10610; GFX7: ; %bb.0: ; %entry 10611; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 10612; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 10613; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10614; GFX7-NEXT: v_mov_b32_e32 v0, s6 10615; GFX7-NEXT: v_mov_b32_e32 v1, s7 10616; GFX7-NEXT: v_mov_b32_e32 v2, s4 10617; GFX7-NEXT: flat_store_dword v[0:1], v2 10618; GFX7-NEXT: s_endpgm 10619; 10620; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: 10621; GFX10-WGP: ; %bb.0: ; %entry 10622; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 10623; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10624; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10625; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10626; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10627; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 10628; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10629; GFX10-WGP-NEXT: s_endpgm 10630; 10631; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: 10632; GFX10-CU: ; %bb.0: ; %entry 10633; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 10634; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10635; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10636; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10637; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10638; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 10639; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10640; GFX10-CU-NEXT: s_endpgm 10641; 10642; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: 10643; SKIP-CACHE-INV: ; %bb.0: ; %entry 10644; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 10645; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10646; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10647; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10648; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10649; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10650; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10651; SKIP-CACHE-INV-NEXT: s_endpgm 10652; 10653; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 10654; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10655; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10656; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10657; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10658; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10659; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10660; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10661; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10662; 10663; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 10664; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10665; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10666; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10667; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10668; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10669; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10670; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10671; GFX90A-TGSPLIT-NEXT: s_endpgm 10672; 10673; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 10674; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10675; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10676; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10677; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10678; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10679; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10680; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10681; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10682; 10683; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: 10684; GFX940-TGSPLIT: ; %bb.0: ; %entry 10685; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10686; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10687; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10688; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 
10689; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10690; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10691; GFX940-TGSPLIT-NEXT: s_endpgm 10692; 10693; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store: 10694; GFX11-WGP: ; %bb.0: ; %entry 10695; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10696; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10697; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10698; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10699; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10700; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 10701; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10702; GFX11-WGP-NEXT: s_endpgm 10703; 10704; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store: 10705; GFX11-CU: ; %bb.0: ; %entry 10706; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10707; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10708; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10709; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10710; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10711; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 10712; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10713; GFX11-CU-NEXT: s_endpgm 10714; 10715; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_store: 10716; GFX12-WGP: ; %bb.0: ; %entry 10717; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10718; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10719; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10720; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10721; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10722; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 10723; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10724; GFX12-WGP-NEXT: s_endpgm 10725; 10726; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_store: 10727; GFX12-CU: ; %bb.0: ; %entry 10728; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10729; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10730; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10731; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 10732; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 10733; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 10734; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 10735; GFX12-CU-NEXT: 
s_endpgm 10736 i32 %in, ptr %out) { 10737entry: 10738 store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4 10739 ret void 10740} 10741 10742define amdgpu_kernel void @flat_singlethread_one_as_release_store( 10743; GFX7-LABEL: flat_singlethread_one_as_release_store: 10744; GFX7: ; %bb.0: ; %entry 10745; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 10746; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 10747; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10748; GFX7-NEXT: v_mov_b32_e32 v0, s6 10749; GFX7-NEXT: v_mov_b32_e32 v1, s7 10750; GFX7-NEXT: v_mov_b32_e32 v2, s4 10751; GFX7-NEXT: flat_store_dword v[0:1], v2 10752; GFX7-NEXT: s_endpgm 10753; 10754; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: 10755; GFX10-WGP: ; %bb.0: ; %entry 10756; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 10757; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10758; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10759; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10760; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10761; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 10762; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10763; GFX10-WGP-NEXT: s_endpgm 10764; 10765; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: 10766; GFX10-CU: ; %bb.0: ; %entry 10767; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 10768; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10769; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10770; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10771; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10772; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 10773; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10774; GFX10-CU-NEXT: s_endpgm 10775; 10776; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: 10777; SKIP-CACHE-INV: ; %bb.0: ; %entry 10778; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 10779; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10780; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10783; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10784; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10785; SKIP-CACHE-INV-NEXT: s_endpgm 10786; 10787; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: 10788; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10789; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10790; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10791; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10792; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10793; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10794; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10795; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10796; 10797; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: 10798; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10799; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10800; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10801; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10802; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10803; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10804; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10805; GFX90A-TGSPLIT-NEXT: s_endpgm 10806; 10807; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: 10808; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10809; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10810; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10811; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10812; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10813; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10814; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10815; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10816; 10817; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: 10818; GFX940-TGSPLIT: ; %bb.0: ; %entry 10819; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10820; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10821; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10822; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10823; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10824; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10825; GFX940-TGSPLIT-NEXT: s_endpgm 10826; 10827; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store: 10828; GFX11-WGP: ; %bb.0: ; %entry 10829; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10830; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10831; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10832; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10833; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10834; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 10835; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10836; GFX11-WGP-NEXT: s_endpgm 10837; 10838; GFX11-CU-LABEL: flat_singlethread_one_as_release_store: 10839; GFX11-CU: ; %bb.0: ; %entry 10840; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10841; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10842; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10843; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10844; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10845; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 10846; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10847; GFX11-CU-NEXT: s_endpgm 10848; 10849; GFX12-WGP-LABEL: flat_singlethread_one_as_release_store: 10850; GFX12-WGP: ; %bb.0: ; %entry 10851; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10852; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10853; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10854; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10855; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10856; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 10857; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10858; GFX12-WGP-NEXT: s_endpgm 10859; 10860; GFX12-CU-LABEL: flat_singlethread_one_as_release_store: 10861; GFX12-CU: ; %bb.0: ; %entry 10862; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10863; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10864; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10865; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 10866; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 10867; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 10868; GFX12-CU-NEXT: 
; Kernel @flat_singlethread_one_as_seq_cst_store (its `define` appears on the following line):
; performs `store atomic i32 %in, ptr %out` with syncscope("singlethread-one-as") and seq_cst ordering.
; Every recorded sequence ends in one flat store (flat_store_dword, or flat_store_b32 in the gfx11/gfx12
; runs; the gfx940 runs show it with `sc0 sc1`) immediately followed by s_endpgm.
; The per-target expectations are script-generated (utils/update_llc_test_checks.py); regenerate, do not hand-edit.
flat_store_b32 v[0:1], v2 10869; GFX12-CU-NEXT: s_endpgm 10870 i32 %in, ptr %out) { 10871entry: 10872 store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 10873 ret void 10874} 10875 10876define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( 10877; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: 10878; GFX7: ; %bb.0: ; %entry 10879; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 10880; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 10881; GFX7-NEXT: s_waitcnt lgkmcnt(0) 10882; GFX7-NEXT: v_mov_b32_e32 v0, s6 10883; GFX7-NEXT: v_mov_b32_e32 v1, s7 10884; GFX7-NEXT: v_mov_b32_e32 v2, s4 10885; GFX7-NEXT: flat_store_dword v[0:1], v2 10886; GFX7-NEXT: s_endpgm 10887; 10888; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: 10889; GFX10-WGP: ; %bb.0: ; %entry 10890; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 10891; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10892; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 10893; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 10894; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 10895; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 10896; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 10897; GFX10-WGP-NEXT: s_endpgm 10898; 10899; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: 10900; GFX10-CU: ; %bb.0: ; %entry 10901; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 10902; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10903; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 10904; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 10905; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 10906; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 10907; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 10908; GFX10-CU-NEXT: s_endpgm 10909; 10910; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: 10911; SKIP-CACHE-INV: ; %bb.0: ; %entry 10912; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 10913; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 10914; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 10915; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 10916;
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 10917; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 10918; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 10919; SKIP-CACHE-INV-NEXT: s_endpgm 10920; 10921; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 10922; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 10923; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10924; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10925; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10926; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10927; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10928; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 10929; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 10930; 10931; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 10932; GFX90A-TGSPLIT: ; %bb.0: ; %entry 10933; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 10934; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 10935; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10936; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 10937; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 10938; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 10939; GFX90A-TGSPLIT-NEXT: s_endpgm 10940; 10941; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 10942; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 10943; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10944; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10945; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10946; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10947; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10948; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10949; GFX940-NOTTGSPLIT-NEXT: s_endpgm 10950; 10951; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: 10952; GFX940-TGSPLIT: ; %bb.0: ; %entry 10953; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 10954; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 10955; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 10956; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 10957; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 10958; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 10959; GFX940-TGSPLIT-NEXT: s_endpgm 10960; 10961; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: 10962; GFX11-WGP: ; %bb.0: ; %entry 10963; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10964; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10965; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 10966; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 10967; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 10968; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 10969; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 10970; GFX11-WGP-NEXT: s_endpgm 10971; 10972; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store: 10973; GFX11-CU: ; %bb.0: ; %entry 10974; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10975; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10976; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 10977; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 10978; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 10979; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 10980; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 10981; GFX11-CU-NEXT: s_endpgm 10982; 10983; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: 10984; GFX12-WGP: ; %bb.0: ; %entry 10985; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 10986; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10987; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 10988; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 10989; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 10990; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 10991; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 10992; GFX12-WGP-NEXT: s_endpgm 10993; 10994; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_store: 10995; GFX12-CU: ; %bb.0: ; %entry 10996; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 10997; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 10998; GFX12-CU-NEXT: s_wait_kmcnt 0x0 10999; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11000; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11001; GFX12-CU-NEXT: 
; Kernel @flat_singlethread_one_as_monotonic_atomicrmw (its `define` appears on the following line):
; performs `atomicrmw volatile xchg ptr %out, i32 %in` with syncscope("singlethread-one-as") and
; monotonic ordering; the result %val is not used afterwards.
; Every recorded sequence ends in one flat_atomic_swap (flat_atomic_swap_b32 in the gfx11/gfx12 runs)
; immediately followed by s_endpgm.
; The per-target expectations are script-generated (utils/update_llc_test_checks.py); regenerate, do not hand-edit.
v_mov_b32_e32 v2, s0 11002; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 11003; GFX12-CU-NEXT: s_endpgm 11004 i32 %in, ptr %out) { 11005entry: 11006 store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 11007 ret void 11008} 11009 11010define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( 11011; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11012; GFX7: ; %bb.0: ; %entry 11013; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11014; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11015; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11016; GFX7-NEXT: v_mov_b32_e32 v0, s6 11017; GFX7-NEXT: v_mov_b32_e32 v1, s7 11018; GFX7-NEXT: v_mov_b32_e32 v2, s4 11019; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11020; GFX7-NEXT: s_endpgm 11021; 11022; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11023; GFX10-WGP: ; %bb.0: ; %entry 11024; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11025; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11026; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11027; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11028; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11029; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11030; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11031; GFX10-WGP-NEXT: s_endpgm 11032; 11033; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11034; GFX10-CU: ; %bb.0: ; %entry 11035; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11036; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11037; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11038; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11039; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11040; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11041; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11042; GFX10-CU-NEXT: s_endpgm 11043; 11044; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11045; SKIP-CACHE-INV: ; %bb.0: ; %entry 11046; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11047; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11048; SKIP-CACHE-INV-NEXT: s_waitcnt
lgkmcnt(0) 11049; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11050; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11051; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11052; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11053; SKIP-CACHE-INV-NEXT: s_endpgm 11054; 11055; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11056; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11057; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11058; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11059; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11060; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11061; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11062; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11063; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11064; 11065; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11066; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11067; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11068; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11069; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11070; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11071; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11072; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11073; GFX90A-TGSPLIT-NEXT: s_endpgm 11074; 11075; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11076; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11077; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11078; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11079; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11080; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11081; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11082; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11083; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11084; 11085; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11086; GFX940-TGSPLIT: ; %bb.0: ; %entry 11087; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x0 11088; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11089; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11090; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11091; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11092; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11093; GFX940-TGSPLIT-NEXT: s_endpgm 11094; 11095; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11096; GFX11-WGP: ; %bb.0: ; %entry 11097; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11098; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11099; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11100; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11101; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11102; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11103; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11104; GFX11-WGP-NEXT: s_endpgm 11105; 11106; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11107; GFX11-CU: ; %bb.0: ; %entry 11108; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11109; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11110; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11111; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11112; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11113; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11114; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11115; GFX11-CU-NEXT: s_endpgm 11116; 11117; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11118; GFX12-WGP: ; %bb.0: ; %entry 11119; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11120; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11121; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11122; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11123; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11124; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11125; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11126; GFX12-WGP-NEXT: s_endpgm 11127; 11128; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: 11129; GFX12-CU: ; %bb.0: ; %entry 11130; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11131; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11132; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11133; 
; Kernel @flat_singlethread_one_as_acquire_atomicrmw (its `define` appears on the following line):
; performs `atomicrmw volatile xchg ptr %out, i32 %in` with syncscope("singlethread-one-as") and
; acquire ordering; the result %val is not used afterwards.
; Every recorded sequence ends in one flat_atomic_swap (flat_atomic_swap_b32 in the gfx11/gfx12 runs)
; immediately followed by s_endpgm.
; The per-target expectations are script-generated (utils/update_llc_test_checks.py); regenerate, do not hand-edit.
GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11134; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11135; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11136; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11137; GFX12-CU-NEXT: s_endpgm 11138 ptr %out, i32 %in) { 11139entry: 11140 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic 11141 ret void 11142} 11143 11144define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( 11145; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11146; GFX7: ; %bb.0: ; %entry 11147; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11148; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11149; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11150; GFX7-NEXT: v_mov_b32_e32 v0, s6 11151; GFX7-NEXT: v_mov_b32_e32 v1, s7 11152; GFX7-NEXT: v_mov_b32_e32 v2, s4 11153; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11154; GFX7-NEXT: s_endpgm 11155; 11156; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11157; GFX10-WGP: ; %bb.0: ; %entry 11158; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11159; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11160; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11161; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11162; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11163; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11164; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11165; GFX10-WGP-NEXT: s_endpgm 11166; 11167; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11168; GFX10-CU: ; %bb.0: ; %entry 11169; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11170; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11171; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11172; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11173; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11174; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11175; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11176; GFX10-CU-NEXT: s_endpgm 11177; 11178; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11179; SKIP-CACHE-INV: ; %bb.0: ; %entry 11180; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3],
s[4:5], 0x0 11181; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11182; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11183; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11184; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11185; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11186; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11187; SKIP-CACHE-INV-NEXT: s_endpgm 11188; 11189; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11190; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11191; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11192; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11193; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11194; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11195; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11196; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11197; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11198; 11199; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11200; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11201; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11202; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11203; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11204; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11205; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11206; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11207; GFX90A-TGSPLIT-NEXT: s_endpgm 11208; 11209; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11210; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11211; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11212; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11213; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11214; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11215; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11216; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11217; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11218; 11219; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 
11220; GFX940-TGSPLIT: ; %bb.0: ; %entry 11221; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11222; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11223; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11224; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11225; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11226; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11227; GFX940-TGSPLIT-NEXT: s_endpgm 11228; 11229; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11230; GFX11-WGP: ; %bb.0: ; %entry 11231; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11232; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11233; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11234; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11235; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11236; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11237; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11238; GFX11-WGP-NEXT: s_endpgm 11239; 11240; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11241; GFX11-CU: ; %bb.0: ; %entry 11242; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11243; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11244; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11245; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11246; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11247; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11248; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11249; GFX11-CU-NEXT: s_endpgm 11250; 11251; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11252; GFX12-WGP: ; %bb.0: ; %entry 11253; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11254; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11255; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11256; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11257; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11258; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11259; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11260; GFX12-WGP-NEXT: s_endpgm 11261; 11262; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 11263; GFX12-CU: ; %bb.0: ; %entry 11264; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11265; 
; Kernel @flat_singlethread_one_as_release_atomicrmw (its `define` appears on the following line):
; performs `atomicrmw volatile xchg ptr %out, i32 %in` with syncscope("singlethread-one-as") and
; release ordering; the result %val is not used afterwards.
; Every recorded sequence ends in one flat_atomic_swap (flat_atomic_swap_b32 in the gfx11/gfx12 runs)
; immediately followed by s_endpgm.
; The per-target expectations are script-generated (utils/update_llc_test_checks.py); regenerate, do not hand-edit.
GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11266; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11267; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11268; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11269; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11270; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11271; GFX12-CU-NEXT: s_endpgm 11272 ptr %out, i32 %in) { 11273entry: 11274 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire 11275 ret void 11276} 11277 11278define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( 11279; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: 11280; GFX7: ; %bb.0: ; %entry 11281; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11282; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11283; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11284; GFX7-NEXT: v_mov_b32_e32 v0, s6 11285; GFX7-NEXT: v_mov_b32_e32 v1, s7 11286; GFX7-NEXT: v_mov_b32_e32 v2, s4 11287; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11288; GFX7-NEXT: s_endpgm 11289; 11290; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: 11291; GFX10-WGP: ; %bb.0: ; %entry 11292; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11293; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11294; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11295; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11296; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11297; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11298; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11299; GFX10-WGP-NEXT: s_endpgm 11300; 11301; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: 11302; GFX10-CU: ; %bb.0: ; %entry 11303; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11304; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11305; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11306; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11307; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11308; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11309; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11310; GFX10-CU-NEXT: s_endpgm 11311; 11312; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: 11313;
SKIP-CACHE-INV: ; %bb.0: ; %entry 11314; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11315; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11316; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11317; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11318; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11319; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11320; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11321; SKIP-CACHE-INV-NEXT: s_endpgm 11322; 11323; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 11324; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11325; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11326; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11327; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11328; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11329; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11330; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11331; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11332; 11333; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 11334; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11335; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11336; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11337; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11338; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11339; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11340; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11341; GFX90A-TGSPLIT-NEXT: s_endpgm 11342; 11343; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 11344; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11345; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11346; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11347; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11348; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11349; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11350; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11351; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm 11352; 11353; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: 11354; GFX940-TGSPLIT: ; %bb.0: ; %entry 11355; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11356; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11357; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11358; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11359; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11360; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11361; GFX940-TGSPLIT-NEXT: s_endpgm 11362; 11363; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: 11364; GFX11-WGP: ; %bb.0: ; %entry 11365; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11366; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11367; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11368; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11369; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11370; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11371; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11372; GFX11-WGP-NEXT: s_endpgm 11373; 11374; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: 11375; GFX11-CU: ; %bb.0: ; %entry 11376; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11377; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11378; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11379; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11380; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11381; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11382; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11383; GFX11-CU-NEXT: s_endpgm 11384; 11385; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: 11386; GFX12-WGP: ; %bb.0: ; %entry 11387; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11388; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11389; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11390; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11391; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11392; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11393; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11394; GFX12-WGP-NEXT: s_endpgm 11395; 11396; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: 11397; 
; Kernel @flat_singlethread_one_as_acq_rel_atomicrmw (its `define` appears on the following line):
; performs `atomicrmw volatile xchg ptr %out, i32 %in` with syncscope("singlethread-one-as") and
; acq_rel ordering; the result %val is not used afterwards.
; Every recorded sequence ends in one flat_atomic_swap (flat_atomic_swap_b32 in the gfx11/gfx12 runs)
; immediately followed by s_endpgm.
; The per-target expectations are script-generated (utils/update_llc_test_checks.py); regenerate, do not hand-edit.
GFX12-CU: ; %bb.0: ; %entry 11398; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11399; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11400; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11401; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11402; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11403; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11404; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11405; GFX12-CU-NEXT: s_endpgm 11406 ptr %out, i32 %in) { 11407entry: 11408 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release 11409 ret void 11410} 11411 11412define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( 11413; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11414; GFX7: ; %bb.0: ; %entry 11415; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11416; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11417; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11418; GFX7-NEXT: v_mov_b32_e32 v0, s6 11419; GFX7-NEXT: v_mov_b32_e32 v1, s7 11420; GFX7-NEXT: v_mov_b32_e32 v2, s4 11421; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11422; GFX7-NEXT: s_endpgm 11423; 11424; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11425; GFX10-WGP: ; %bb.0: ; %entry 11426; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11427; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11428; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11429; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11430; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11431; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11432; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11433; GFX10-WGP-NEXT: s_endpgm 11434; 11435; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11436; GFX10-CU: ; %bb.0: ; %entry 11437; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11438; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11439; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11440; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11441; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11442; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 11443; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11444; GFX10-CU-NEXT: s_endpgm
11445; 11446; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11447; SKIP-CACHE-INV: ; %bb.0: ; %entry 11448; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11449; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11450; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11451; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11452; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11453; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11454; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11455; SKIP-CACHE-INV-NEXT: s_endpgm 11456; 11457; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11458; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11459; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11460; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11461; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11462; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11463; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11464; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11465; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11466; 11467; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11468; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11469; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11470; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11471; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11472; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11473; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11474; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11475; GFX90A-TGSPLIT-NEXT: s_endpgm 11476; 11477; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11478; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11479; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11480; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11481; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11482; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11483; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11484; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11485; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11486; 11487; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11488; GFX940-TGSPLIT: ; %bb.0: ; %entry 11489; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11490; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11491; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11492; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11493; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11494; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11495; GFX940-TGSPLIT-NEXT: s_endpgm 11496; 11497; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11498; GFX11-WGP: ; %bb.0: ; %entry 11499; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11500; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11501; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11502; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11503; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11504; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11505; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11506; GFX11-WGP-NEXT: s_endpgm 11507; 11508; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11509; GFX11-CU: ; %bb.0: ; %entry 11510; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11511; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11512; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11513; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11514; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11515; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11516; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11517; GFX11-CU-NEXT: s_endpgm 11518; 11519; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11520; GFX12-WGP: ; %bb.0: ; %entry 11521; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11522; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11523; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11524; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11525; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11526; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 11527; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11528; GFX12-WGP-NEXT: 
s_endpgm 11529; 11530; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: 11531; GFX12-CU: ; %bb.0: ; %entry 11532; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11533; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11534; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11535; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11536; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11537; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11538; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11539; GFX12-CU-NEXT: s_endpgm 11540 ptr %out, i32 %in) { 11541entry: 11542 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel 11543 ret void 11544} 11545 11546define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( 11547; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11548; GFX7: ; %bb.0: ; %entry 11549; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11550; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 11551; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11552; GFX7-NEXT: v_mov_b32_e32 v0, s6 11553; GFX7-NEXT: v_mov_b32_e32 v1, s7 11554; GFX7-NEXT: v_mov_b32_e32 v2, s4 11555; GFX7-NEXT: flat_atomic_swap v[0:1], v2 11556; GFX7-NEXT: s_endpgm 11557; 11558; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11559; GFX10-WGP: ; %bb.0: ; %entry 11560; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11561; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 11562; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11563; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 11564; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 11565; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 11566; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 11567; GFX10-WGP-NEXT: s_endpgm 11568; 11569; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11570; GFX10-CU: ; %bb.0: ; %entry 11571; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11572; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 11573; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11574; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 11575; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 11576; GFX10-CU-NEXT: v_mov_b32_e32 
v2, s4 11577; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 11578; GFX10-CU-NEXT: s_endpgm 11579; 11580; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11581; SKIP-CACHE-INV: ; %bb.0: ; %entry 11582; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11583; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 11584; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11585; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 11586; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 11587; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 11588; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 11589; SKIP-CACHE-INV-NEXT: s_endpgm 11590; 11591; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11592; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11593; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11594; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11595; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11596; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11597; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11598; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11599; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11600; 11601; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11602; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11603; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 11604; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 11605; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11606; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] 11607; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 11608; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11609; GFX90A-TGSPLIT-NEXT: s_endpgm 11610; 11611; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11612; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11613; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11614; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11615; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11616; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] 11617; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11618; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11619; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11620; 11621; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11622; GFX940-TGSPLIT: ; %bb.0: ; %entry 11623; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 11624; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 11625; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11626; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 11627; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 11628; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 11629; GFX940-TGSPLIT-NEXT: s_endpgm 11630; 11631; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11632; GFX11-WGP: ; %bb.0: ; %entry 11633; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11634; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11635; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11636; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 11637; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 11638; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 11639; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11640; GFX11-WGP-NEXT: s_endpgm 11641; 11642; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11643; GFX11-CU: ; %bb.0: ; %entry 11644; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11645; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11646; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11647; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 11648; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 11649; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 11650; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11651; GFX11-CU-NEXT: s_endpgm 11652; 11653; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11654; GFX12-WGP: ; %bb.0: ; %entry 11655; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11656; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 11657; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11658; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 11659; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 11660; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 
s0 11661; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 11662; GFX12-WGP-NEXT: s_endpgm 11663; 11664; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: 11665; GFX12-CU: ; %bb.0: ; %entry 11666; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 11667; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 11668; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11669; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 11670; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 11671; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 11672; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 11673; GFX12-CU-NEXT: s_endpgm 11674 ptr %out, i32 %in) { 11675entry: 11676 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst 11677 ret void 11678} 11679 11680define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( 11681; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11682; GFX7: ; %bb.0: ; %entry 11683; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11684; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 11685; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11686; GFX7-NEXT: v_mov_b32_e32 v0, s4 11687; GFX7-NEXT: v_mov_b32_e32 v1, s5 11688; GFX7-NEXT: v_mov_b32_e32 v2, s6 11689; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11690; GFX7-NEXT: v_mov_b32_e32 v0, s4 11691; GFX7-NEXT: v_mov_b32_e32 v1, s5 11692; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11693; GFX7-NEXT: flat_store_dword v[0:1], v2 11694; GFX7-NEXT: s_endpgm 11695; 11696; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11697; GFX10-WGP: ; %bb.0: ; %entry 11698; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11699; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 11700; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11701; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11702; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11703; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 11704; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11705; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11706; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11707; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 11708; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11709; GFX10-WGP-NEXT: s_endpgm 11710; 11711; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11712; GFX10-CU: ; %bb.0: ; %entry 11713; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11714; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 11715; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11716; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11717; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11718; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 11719; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11720; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11721; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11722; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11723; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11724; GFX10-CU-NEXT: s_endpgm 11725; 11726; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11727; SKIP-CACHE-INV: ; %bb.0: ; %entry 11728; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11729; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 11730; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11731; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 11732; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 11733; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 11734; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11735; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 11736; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 11737; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11738; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11739; SKIP-CACHE-INV-NEXT: s_endpgm 11740; 11741; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11742; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11743; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11744; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 11745; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11746; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11747; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 11748; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_swap v2, v[0:1], v2 glc 11749; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11750; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11751; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11752; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11753; 11754; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11755; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11756; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11757; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 11758; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11759; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11760; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 11761; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11762; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11763; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11764; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11765; GFX90A-TGSPLIT-NEXT: s_endpgm 11766; 11767; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11768; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11769; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11770; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 11771; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11772; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11773; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 11774; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 11775; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11776; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11777; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11778; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11779; 11780; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11781; GFX940-TGSPLIT: ; %bb.0: ; %entry 11782; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11783; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 11784; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11785; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11786; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 11787; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 11788; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11789; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11790; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11791; GFX940-TGSPLIT-NEXT: s_endpgm 11792; 11793; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11794; GFX11-WGP: ; %bb.0: ; %entry 11795; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11796; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 11797; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11798; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 11799; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 11800; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 11801; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 11802; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 11803; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 11804; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11805; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11806; GFX11-WGP-NEXT: s_endpgm 11807; 11808; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11809; GFX11-CU: ; %bb.0: ; %entry 11810; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11811; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 11812; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11813; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 11814; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 11815; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 11816; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 11817; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 11818; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 11819; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11820; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 11821; GFX11-CU-NEXT: s_endpgm 11822; 11823; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11824; GFX12-WGP: ; %bb.0: ; %entry 11825; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11826; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 11827; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 11828; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 11829; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 11830; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 11831; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 11832; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 11833; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 11834; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 11835; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 11836; GFX12-WGP-NEXT: s_endpgm 11837; 11838; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: 11839; GFX12-CU: ; %bb.0: ; %entry 11840; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11841; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 11842; GFX12-CU-NEXT: s_wait_kmcnt 0x0 11843; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 11844; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 11845; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 11846; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 11847; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 11848; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 11849; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 11850; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 11851; GFX12-CU-NEXT: s_endpgm 11852 ptr %out, i32 %in) { 11853entry: 11854 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire 11855 store i32 %val, ptr %out, align 4 11856 ret void 11857} 11858 11859define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( 11860; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11861; GFX7: ; %bb.0: ; %entry 11862; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11863; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 11864; GFX7-NEXT: s_waitcnt lgkmcnt(0) 11865; GFX7-NEXT: v_mov_b32_e32 v0, s4 11866; GFX7-NEXT: v_mov_b32_e32 v1, s5 11867; GFX7-NEXT: v_mov_b32_e32 v2, s6 11868; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11869; GFX7-NEXT: v_mov_b32_e32 v0, s4 11870; GFX7-NEXT: v_mov_b32_e32 v1, s5 11871; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11872; GFX7-NEXT: flat_store_dword v[0:1], v2 11873; GFX7-NEXT: s_endpgm 11874; 11875; GFX10-WGP-LABEL: 
flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11876; GFX10-WGP: ; %bb.0: ; %entry 11877; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11878; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 11879; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 11880; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11881; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11882; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 11883; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11884; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 11885; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 11886; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11887; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 11888; GFX10-WGP-NEXT: s_endpgm 11889; 11890; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11891; GFX10-CU: ; %bb.0: ; %entry 11892; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11893; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 11894; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 11895; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11896; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11897; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 11898; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11899; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 11900; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 11901; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11902; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 11903; GFX10-CU-NEXT: s_endpgm 11904; 11905; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11906; SKIP-CACHE-INV: ; %bb.0: ; %entry 11907; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11908; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 11909; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 11910; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 11911; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 11912; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 11913; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11914; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 11915; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 11916; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11917; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 11918; SKIP-CACHE-INV-NEXT: s_endpgm 11919; 11920; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11921; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 11922; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11923; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 11924; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11925; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11926; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 11927; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11928; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11929; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11930; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 11931; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 11932; 11933; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11934; GFX90A-TGSPLIT: ; %bb.0: ; %entry 11935; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 11936; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 11937; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11938; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11939; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 11940; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 11941; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 11942; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11943; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 11944; GFX90A-TGSPLIT-NEXT: s_endpgm 11945; 11946; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11947; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 11948; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11949; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 11950; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11951; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11952; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 11953; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_swap v2, v[0:1], v2 sc0 11954; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11955; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11956; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11957; GFX940-NOTTGSPLIT-NEXT: s_endpgm 11958; 11959; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11960; GFX940-TGSPLIT: ; %bb.0: ; %entry 11961; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11962; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 11963; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 11964; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11965; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 11966; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 11967; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 11968; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 11969; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 11970; GFX940-TGSPLIT-NEXT: s_endpgm 11971; 11972; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11973; GFX11-WGP: ; %bb.0: ; %entry 11974; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11975; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 11976; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 11977; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 11978; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 11979; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 11980; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 11981; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 11982; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 11983; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11984; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 11985; GFX11-WGP-NEXT: s_endpgm 11986; 11987; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 11988; GFX11-CU: ; %bb.0: ; %entry 11989; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 11990; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 11991; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 11992; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 11993; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 11994; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 
11995; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 11996; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 11997; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 11998; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11999; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 12000; GFX11-CU-NEXT: s_endpgm 12001; 12002; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 12003; GFX12-WGP: ; %bb.0: ; %entry 12004; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12005; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12006; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12007; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12008; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12009; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 12010; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 12011; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12012; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12013; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 12014; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 12015; GFX12-WGP-NEXT: s_endpgm 12016; 12017; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: 12018; GFX12-CU: ; %bb.0: ; %entry 12019; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12020; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12021; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12022; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12023; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12024; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 12025; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 12026; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12027; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12028; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 12029; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 12030; GFX12-CU-NEXT: s_endpgm 12031 ptr %out, i32 %in) { 12032entry: 12033 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel 12034 store i32 %val, ptr %out, align 4 12035 ret void 12036} 12037 12038define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( 12039; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12040; 
GFX7: ; %bb.0: ; %entry 12041; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12042; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 12043; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12044; GFX7-NEXT: v_mov_b32_e32 v0, s4 12045; GFX7-NEXT: v_mov_b32_e32 v1, s5 12046; GFX7-NEXT: v_mov_b32_e32 v2, s6 12047; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12048; GFX7-NEXT: v_mov_b32_e32 v0, s4 12049; GFX7-NEXT: v_mov_b32_e32 v1, s5 12050; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12051; GFX7-NEXT: flat_store_dword v[0:1], v2 12052; GFX7-NEXT: s_endpgm 12053; 12054; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12055; GFX10-WGP: ; %bb.0: ; %entry 12056; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12057; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 12058; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12059; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12060; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12061; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 12062; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12063; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12064; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12065; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12066; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 12067; GFX10-WGP-NEXT: s_endpgm 12068; 12069; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12070; GFX10-CU: ; %bb.0: ; %entry 12071; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12072; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 12073; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12074; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12075; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12076; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 12077; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12078; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12079; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12080; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12081; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 12082; GFX10-CU-NEXT: s_endpgm 12083; 12084; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12085; SKIP-CACHE-INV: ; 
%bb.0: ; %entry 12086; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12087; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 12088; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12091; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 12092; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12093; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12094; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12095; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12096; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 12097; SKIP-CACHE-INV-NEXT: s_endpgm 12098; 12099; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12100; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12101; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12102; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 12103; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12104; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12105; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 12106; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12107; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12108; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12109; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 12110; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12111; 12112; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12113; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12114; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12115; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 12116; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12117; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12118; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 12119; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 12120; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12121; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) 12122; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 12123; GFX90A-TGSPLIT-NEXT: s_endpgm 12124; 12125; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12126; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12127; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12128; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 12129; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12130; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12131; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 12132; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 12133; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12134; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12135; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 12136; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12137; 12138; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12139; GFX940-TGSPLIT: ; %bb.0: ; %entry 12140; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12141; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 12142; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12143; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12144; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 12145; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 12146; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12147; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 12148; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 12149; GFX940-TGSPLIT-NEXT: s_endpgm 12150; 12151; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12152; GFX11-WGP: ; %bb.0: ; %entry 12153; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12154; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12155; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12156; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12157; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12158; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 12159; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 12160; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12161; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12162; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12163; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 12164; GFX11-WGP-NEXT: s_endpgm 12165; 12166; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12167; GFX11-CU: ; %bb.0: ; %entry 12168; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12169; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12170; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12171; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12172; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12173; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 12174; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 12175; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12176; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12177; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12178; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 12179; GFX11-CU-NEXT: s_endpgm 12180; 12181; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12182; GFX12-WGP: ; %bb.0: ; %entry 12183; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12184; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 12185; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12186; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12187; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12188; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 12189; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 12190; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12191; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12192; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 12193; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 12194; GFX12-WGP-NEXT: s_endpgm 12195; 12196; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: 12197; GFX12-CU: ; %bb.0: ; %entry 12198; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12199; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 12200; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12201; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12202; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12203; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 12204; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN 12205; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12206; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12207; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 12208; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 12209; GFX12-CU-NEXT: s_endpgm 12210 ptr %out, i32 %in) { 12211entry: 12212 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst 12213 store i32 %val, ptr %out, align 4 12214 ret void 12215} 12216 12217define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( 12218; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12219; GFX7: ; %bb.0: ; %entry 12220; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12221; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12222; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12223; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 12224; GFX7-NEXT: s_mov_b64 s[10:11], 16 12225; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12226; GFX7-NEXT: s_mov_b32 s4, s8 12227; GFX7-NEXT: s_mov_b32 s5, s9 12228; GFX7-NEXT: s_mov_b32 s9, s10 12229; GFX7-NEXT: s_mov_b32 s8, s11 12230; GFX7-NEXT: s_add_u32 s4, s4, s9 12231; GFX7-NEXT: s_addc_u32 s8, s5, s8 12232; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12233; GFX7-NEXT: s_mov_b32 s5, s8 12234; GFX7-NEXT: v_mov_b32_e32 v2, s7 12235; GFX7-NEXT: v_mov_b32_e32 v0, s6 12236; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12237; GFX7-NEXT: v_mov_b32_e32 v3, v0 12238; GFX7-NEXT: v_mov_b32_e32 v0, s4 12239; GFX7-NEXT: v_mov_b32_e32 v1, s5 12240; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12241; GFX7-NEXT: s_endpgm 12242; 12243; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12244; GFX10-WGP: ; %bb.0: ; %entry 12245; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12246; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12247; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12248; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12249; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12250; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12251; GFX10-WGP-NEXT: 
s_mov_b32 s4, s8 12252; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12253; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12254; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12255; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12256; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12257; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12258; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12259; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12260; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12261; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12262; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12263; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12264; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12265; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12266; GFX10-WGP-NEXT: s_endpgm 12267; 12268; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12269; GFX10-CU: ; %bb.0: ; %entry 12270; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 12271; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12272; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12273; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12274; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12275; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12276; GFX10-CU-NEXT: s_mov_b32 s4, s8 12277; GFX10-CU-NEXT: s_mov_b32 s5, s9 12278; GFX10-CU-NEXT: s_mov_b32 s9, s10 12279; GFX10-CU-NEXT: s_mov_b32 s8, s11 12280; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 12281; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12282; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12283; GFX10-CU-NEXT: s_mov_b32 s5, s8 12284; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12285; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12286; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12287; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12288; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12289; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12290; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12291; GFX10-CU-NEXT: s_endpgm 12292; 12293; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12294; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 12295; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12296; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12297; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12298; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12299; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12300; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12301; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12302; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12303; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12304; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12305; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12306; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12307; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12308; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 12309; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12310; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12311; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12312; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12313; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12314; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12315; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12316; SKIP-CACHE-INV-NEXT: s_endpgm 12317; 12318; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12319; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12320; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12321; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12322; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12323; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12324; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12325; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12326; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12327; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12328; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12329; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12330; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12331; 12332; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12333; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12334; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12335; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12336; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12337; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12338; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12339; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12340; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12341; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12342; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12343; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12344; GFX90A-TGSPLIT-NEXT: s_endpgm 12345; 12346; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12347; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12348; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12349; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12350; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12351; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12352; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12353; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12354; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12355; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12356; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12357; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12358; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12359; 12360; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12361; GFX940-TGSPLIT: ; %bb.0: ; %entry 12362; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12363; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12364; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12365; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12366; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12367; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12368; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12369; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12370; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12371; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12372; GFX940-TGSPLIT-NEXT: s_endpgm 12373; 12374; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12375; GFX11-WGP: ; %bb.0: ; %entry 12376; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12377; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12378; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12379; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12380; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 12381; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 12382; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12383; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 12384; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12385; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12386; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12387; GFX11-WGP-NEXT: s_endpgm 12388; 12389; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12390; GFX11-CU: ; %bb.0: ; %entry 12391; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12392; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12393; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12394; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12395; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 12396; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 12397; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12398; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 12399; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12400; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12401; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12402; GFX11-CU-NEXT: s_endpgm 12403; 12404; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12405; GFX12-WGP: ; %bb.0: ; %entry 12406; GFX12-WGP-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 12407; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12408; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12409; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12410; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 12411; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 12412; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12413; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 12414; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12415; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12416; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12417; GFX12-WGP-NEXT: s_endpgm 12418; 12419; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: 12420; GFX12-CU: ; %bb.0: ; %entry 12421; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12422; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12423; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12424; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12425; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 12426; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 12427; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12428; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 12429; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12430; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12431; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12432; GFX12-CU-NEXT: s_endpgm 12433 ptr %out, i32 %in, i32 %old) { 12434entry: 12435 %gep = getelementptr i32, ptr %out, i32 4 12436 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic 12437 ret void 12438} 12439 12440define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( 12441; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12442; GFX7: ; %bb.0: ; %entry 12443; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12444; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12445; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12446; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 12447; GFX7-NEXT: s_mov_b64 s[10:11], 16 12448; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12449; GFX7-NEXT: 
s_mov_b32 s4, s8 12450; GFX7-NEXT: s_mov_b32 s5, s9 12451; GFX7-NEXT: s_mov_b32 s9, s10 12452; GFX7-NEXT: s_mov_b32 s8, s11 12453; GFX7-NEXT: s_add_u32 s4, s4, s9 12454; GFX7-NEXT: s_addc_u32 s8, s5, s8 12455; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12456; GFX7-NEXT: s_mov_b32 s5, s8 12457; GFX7-NEXT: v_mov_b32_e32 v2, s7 12458; GFX7-NEXT: v_mov_b32_e32 v0, s6 12459; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12460; GFX7-NEXT: v_mov_b32_e32 v3, v0 12461; GFX7-NEXT: v_mov_b32_e32 v0, s4 12462; GFX7-NEXT: v_mov_b32_e32 v1, s5 12463; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12464; GFX7-NEXT: s_endpgm 12465; 12466; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12467; GFX10-WGP: ; %bb.0: ; %entry 12468; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12469; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12470; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12471; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12472; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12473; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12474; GFX10-WGP-NEXT: s_mov_b32 s4, s8 12475; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12476; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12477; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12478; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12479; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12480; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12481; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12482; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12483; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12484; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12485; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12486; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12487; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12488; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12489; GFX10-WGP-NEXT: s_endpgm 12490; 12491; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12492; GFX10-CU: ; %bb.0: ; %entry 12493; GFX10-CU-NEXT: s_mov_b64 s[4:5], 
s[8:9] 12494; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12495; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12496; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12497; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12498; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12499; GFX10-CU-NEXT: s_mov_b32 s4, s8 12500; GFX10-CU-NEXT: s_mov_b32 s5, s9 12501; GFX10-CU-NEXT: s_mov_b32 s9, s10 12502; GFX10-CU-NEXT: s_mov_b32 s8, s11 12503; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 12504; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12505; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12506; GFX10-CU-NEXT: s_mov_b32 s5, s8 12507; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12508; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12509; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12510; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12511; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12512; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12513; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12514; GFX10-CU-NEXT: s_endpgm 12515; 12516; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12517; SKIP-CACHE-INV: ; %bb.0: ; %entry 12518; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12519; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12520; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12521; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12522; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12523; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12524; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12525; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12526; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12527; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12528; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12529; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12530; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12531; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 12532; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12533; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12534; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12535; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12536; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12537; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12538; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12539; SKIP-CACHE-INV-NEXT: s_endpgm 12540; 12541; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12542; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12543; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12544; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12545; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12546; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12547; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12548; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12549; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12550; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12551; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12552; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12553; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12554; 12555; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12556; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12557; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12558; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12559; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12560; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12561; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12562; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12563; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12564; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12565; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12566; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12567; GFX90A-TGSPLIT-NEXT: s_endpgm 12568; 12569; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12570; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12571; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12572; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12573; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12574; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12575; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12576; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12577; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12578; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12579; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12580; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12581; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12582; 12583; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12584; GFX940-TGSPLIT: ; %bb.0: ; %entry 12585; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12586; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12587; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12588; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12589; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12590; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12591; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12592; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12593; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12594; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12595; GFX940-TGSPLIT-NEXT: s_endpgm 12596; 12597; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12598; GFX11-WGP: ; %bb.0: ; %entry 12599; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12600; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12601; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12602; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12603; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 12604; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 12605; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12606; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 
12607; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12608; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12609; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12610; GFX11-WGP-NEXT: s_endpgm 12611; 12612; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12613; GFX11-CU: ; %bb.0: ; %entry 12614; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12615; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12616; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12617; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12618; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 12619; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 12620; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12621; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 12622; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12623; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12624; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12625; GFX11-CU-NEXT: s_endpgm 12626; 12627; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12628; GFX12-WGP: ; %bb.0: ; %entry 12629; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12630; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12631; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12632; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12633; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 12634; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 12635; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12636; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 12637; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12638; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12639; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12640; GFX12-WGP-NEXT: s_endpgm 12641; 12642; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: 12643; GFX12-CU: ; %bb.0: ; %entry 12644; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12645; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12646; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12647; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12648; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 12649; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s2 12650; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12651; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 12652; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12653; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12654; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12655; GFX12-CU-NEXT: s_endpgm 12656 ptr %out, i32 %in, i32 %old) { 12657entry: 12658 %gep = getelementptr i32, ptr %out, i32 4 12659 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic 12660 ret void 12661} 12662 12663define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( 12664; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12665; GFX7: ; %bb.0: ; %entry 12666; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12667; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12668; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12669; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 12670; GFX7-NEXT: s_mov_b64 s[10:11], 16 12671; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12672; GFX7-NEXT: s_mov_b32 s4, s8 12673; GFX7-NEXT: s_mov_b32 s5, s9 12674; GFX7-NEXT: s_mov_b32 s9, s10 12675; GFX7-NEXT: s_mov_b32 s8, s11 12676; GFX7-NEXT: s_add_u32 s4, s4, s9 12677; GFX7-NEXT: s_addc_u32 s8, s5, s8 12678; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12679; GFX7-NEXT: s_mov_b32 s5, s8 12680; GFX7-NEXT: v_mov_b32_e32 v2, s7 12681; GFX7-NEXT: v_mov_b32_e32 v0, s6 12682; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12683; GFX7-NEXT: v_mov_b32_e32 v3, v0 12684; GFX7-NEXT: v_mov_b32_e32 v0, s4 12685; GFX7-NEXT: v_mov_b32_e32 v1, s5 12686; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12687; GFX7-NEXT: s_endpgm 12688; 12689; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12690; GFX10-WGP: ; %bb.0: ; %entry 12691; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12692; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12693; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12694; 
GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12695; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12696; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12697; GFX10-WGP-NEXT: s_mov_b32 s4, s8 12698; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12699; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12700; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12701; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12702; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12703; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12704; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12705; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12706; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12707; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12708; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12709; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12710; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12711; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12712; GFX10-WGP-NEXT: s_endpgm 12713; 12714; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12715; GFX10-CU: ; %bb.0: ; %entry 12716; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 12717; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12718; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12719; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12720; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12721; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12722; GFX10-CU-NEXT: s_mov_b32 s4, s8 12723; GFX10-CU-NEXT: s_mov_b32 s5, s9 12724; GFX10-CU-NEXT: s_mov_b32 s9, s10 12725; GFX10-CU-NEXT: s_mov_b32 s8, s11 12726; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 12727; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12728; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12729; GFX10-CU-NEXT: s_mov_b32 s5, s8 12730; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12731; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12732; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12733; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12734; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12735; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12736; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 12737; GFX10-CU-NEXT: s_endpgm 12738; 12739; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12740; SKIP-CACHE-INV: ; %bb.0: ; %entry 12741; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12742; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12743; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12744; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12745; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12746; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12747; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12748; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12749; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12750; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12751; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12752; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12753; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12754; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 12755; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12756; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12757; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12758; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12759; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12760; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12761; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12762; SKIP-CACHE-INV-NEXT: s_endpgm 12763; 12764; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12765; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12766; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12767; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12768; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12769; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12770; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12771; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12772; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12773; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12774; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12775; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12776; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 12777; 12778; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12779; GFX90A-TGSPLIT: ; %bb.0: ; %entry 12780; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12781; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12782; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12783; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12784; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12785; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12786; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12787; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12788; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12789; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12790; GFX90A-TGSPLIT-NEXT: s_endpgm 12791; 12792; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12793; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 12794; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12795; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 12796; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12797; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12798; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12799; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12800; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12801; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12802; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12803; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12804; GFX940-NOTTGSPLIT-NEXT: s_endpgm 12805; 12806; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12807; GFX940-TGSPLIT: ; %bb.0: ; %entry 12808; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 12809; GFX940-TGSPLIT-NEXT: 
s_load_dword s3, s[4:5], 0x8 12810; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 12811; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12812; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 12813; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 12814; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12815; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12816; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 12817; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12818; GFX940-TGSPLIT-NEXT: s_endpgm 12819; 12820; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12821; GFX11-WGP: ; %bb.0: ; %entry 12822; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12823; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12824; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12825; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 12826; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 12827; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 12828; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12829; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 12830; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 12831; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 12832; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12833; GFX11-WGP-NEXT: s_endpgm 12834; 12835; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12836; GFX11-CU: ; %bb.0: ; %entry 12837; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12838; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12839; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12840; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 12841; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 12842; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 12843; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12844; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 12845; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 12846; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 12847; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12848; GFX11-CU-NEXT: s_endpgm 12849; 12850; 
GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12851; GFX12-WGP: ; %bb.0: ; %entry 12852; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12853; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 12854; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 12855; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 12856; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 12857; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 12858; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12859; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 12860; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 12861; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 12862; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12863; GFX12-WGP-NEXT: s_endpgm 12864; 12865; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: 12866; GFX12-CU: ; %bb.0: ; %entry 12867; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 12868; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 12869; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 12870; GFX12-CU-NEXT: s_wait_kmcnt 0x0 12871; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 12872; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 12873; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12874; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 12875; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 12876; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 12877; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 12878; GFX12-CU-NEXT: s_endpgm 12879 ptr %out, i32 %in, i32 %old) { 12880entry: 12881 %gep = getelementptr i32, ptr %out, i32 4 12882 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic 12883 ret void 12884} 12885 12886define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( 12887; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 12888; GFX7: ; %bb.0: ; %entry 12889; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 12890; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12891; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 12892; GFX7-NEXT: 
s_load_dword s6, s[4:5], 0x3 12893; GFX7-NEXT: s_mov_b64 s[10:11], 16 12894; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12895; GFX7-NEXT: s_mov_b32 s4, s8 12896; GFX7-NEXT: s_mov_b32 s5, s9 12897; GFX7-NEXT: s_mov_b32 s9, s10 12898; GFX7-NEXT: s_mov_b32 s8, s11 12899; GFX7-NEXT: s_add_u32 s4, s4, s9 12900; GFX7-NEXT: s_addc_u32 s8, s5, s8 12901; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12902; GFX7-NEXT: s_mov_b32 s5, s8 12903; GFX7-NEXT: v_mov_b32_e32 v2, s7 12904; GFX7-NEXT: v_mov_b32_e32 v0, s6 12905; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12906; GFX7-NEXT: v_mov_b32_e32 v3, v0 12907; GFX7-NEXT: v_mov_b32_e32 v0, s4 12908; GFX7-NEXT: v_mov_b32_e32 v1, s5 12909; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12910; GFX7-NEXT: s_endpgm 12911; 12912; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 12913; GFX10-WGP: ; %bb.0: ; %entry 12914; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 12915; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12916; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 12917; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 12918; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 12919; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 12920; GFX10-WGP-NEXT: s_mov_b32 s4, s8 12921; GFX10-WGP-NEXT: s_mov_b32 s5, s9 12922; GFX10-WGP-NEXT: s_mov_b32 s9, s10 12923; GFX10-WGP-NEXT: s_mov_b32 s8, s11 12924; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 12925; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 12926; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12927; GFX10-WGP-NEXT: s_mov_b32 s5, s8 12928; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 12929; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 12930; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12931; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 12932; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 12933; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 12934; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12935; GFX10-WGP-NEXT: s_endpgm 12936; 12937; GFX10-CU-LABEL: 
flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 12938; GFX10-CU: ; %bb.0: ; %entry 12939; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 12940; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 12941; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 12942; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 12943; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 12944; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 12945; GFX10-CU-NEXT: s_mov_b32 s4, s8 12946; GFX10-CU-NEXT: s_mov_b32 s5, s9 12947; GFX10-CU-NEXT: s_mov_b32 s9, s10 12948; GFX10-CU-NEXT: s_mov_b32 s8, s11 12949; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 12950; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 12951; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 12952; GFX10-CU-NEXT: s_mov_b32 s5, s8 12953; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 12954; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 12955; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12956; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 12957; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 12958; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 12959; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12960; GFX10-CU-NEXT: s_endpgm 12961; 12962; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 12963; SKIP-CACHE-INV: ; %bb.0: ; %entry 12964; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 12965; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 12966; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 12967; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 12968; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 12969; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 12970; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 12971; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 12972; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 12973; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 12974; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 12975; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 12976; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 12977; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 12978; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 12979; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 12980; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12981; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 12982; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 12983; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 12984; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 12985; SKIP-CACHE-INV-NEXT: s_endpgm 12986; 12987; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 12988; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 12989; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 12990; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 12991; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 12992; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 12993; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 12994; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 12995; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 12996; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 12997; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 12998; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 12999; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13000; 13001; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13002; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13003; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13004; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13005; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13006; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13007; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13008; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13009; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13010; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13011; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13012; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13013; 
GFX90A-TGSPLIT-NEXT: s_endpgm 13014; 13015; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13016; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13017; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13018; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13019; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13020; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13021; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13022; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13023; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13024; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13025; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13026; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13027; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13028; 13029; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13030; GFX940-TGSPLIT: ; %bb.0: ; %entry 13031; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13032; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13033; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13034; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13035; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13036; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13037; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13038; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13039; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13040; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13041; GFX940-TGSPLIT-NEXT: s_endpgm 13042; 13043; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13044; GFX11-WGP: ; %bb.0: ; %entry 13045; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13046; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13047; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13048; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13049; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13050; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 
13051; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13052; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13053; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13054; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13055; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13056; GFX11-WGP-NEXT: s_endpgm 13057; 13058; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13059; GFX11-CU: ; %bb.0: ; %entry 13060; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13061; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13062; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13063; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13064; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13065; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13066; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13067; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13068; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13069; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13070; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13071; GFX11-CU-NEXT: s_endpgm 13072; 13073; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13074; GFX12-WGP: ; %bb.0: ; %entry 13075; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13076; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13077; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13078; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13079; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13080; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13081; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13082; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13083; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13084; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13085; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13086; GFX12-WGP-NEXT: s_endpgm 13087; 13088; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: 13089; GFX12-CU: ; %bb.0: ; %entry 13090; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13091; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13092; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc 13093; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13094; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13095; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13096; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13097; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13098; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13099; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13100; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13101; GFX12-CU-NEXT: s_endpgm 13102 ptr %out, i32 %in, i32 %old) { 13103entry: 13104 %gep = getelementptr i32, ptr %out, i32 4 13105 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic 13106 ret void 13107} 13108 13109define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( 13110; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13111; GFX7: ; %bb.0: ; %entry 13112; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13113; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13114; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13115; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13116; GFX7-NEXT: s_mov_b64 s[10:11], 16 13117; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13118; GFX7-NEXT: s_mov_b32 s4, s8 13119; GFX7-NEXT: s_mov_b32 s5, s9 13120; GFX7-NEXT: s_mov_b32 s9, s10 13121; GFX7-NEXT: s_mov_b32 s8, s11 13122; GFX7-NEXT: s_add_u32 s4, s4, s9 13123; GFX7-NEXT: s_addc_u32 s8, s5, s8 13124; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13125; GFX7-NEXT: s_mov_b32 s5, s8 13126; GFX7-NEXT: v_mov_b32_e32 v2, s7 13127; GFX7-NEXT: v_mov_b32_e32 v0, s6 13128; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13129; GFX7-NEXT: v_mov_b32_e32 v3, v0 13130; GFX7-NEXT: v_mov_b32_e32 v0, s4 13131; GFX7-NEXT: v_mov_b32_e32 v1, s5 13132; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13133; GFX7-NEXT: s_endpgm 13134; 13135; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13136; GFX10-WGP: ; %bb.0: ; %entry 13137; GFX10-WGP-NEXT: s_mov_b64 s[4:5], 
s[8:9] 13138; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13139; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13140; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13141; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13142; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13143; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13144; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13145; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13146; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13147; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13148; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13149; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13150; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13151; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13152; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13153; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13154; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13155; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13156; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13157; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13158; GFX10-WGP-NEXT: s_endpgm 13159; 13160; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13161; GFX10-CU: ; %bb.0: ; %entry 13162; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13163; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13164; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13165; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13166; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13167; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13168; GFX10-CU-NEXT: s_mov_b32 s4, s8 13169; GFX10-CU-NEXT: s_mov_b32 s5, s9 13170; GFX10-CU-NEXT: s_mov_b32 s9, s10 13171; GFX10-CU-NEXT: s_mov_b32 s8, s11 13172; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13173; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13174; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13175; GFX10-CU-NEXT: s_mov_b32 s5, s8 13176; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13177; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13178; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13179; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 
13180; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13181; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13182; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13183; GFX10-CU-NEXT: s_endpgm 13184; 13185; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13186; SKIP-CACHE-INV: ; %bb.0: ; %entry 13187; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13188; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13189; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13190; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13191; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13192; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13193; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13194; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13195; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13196; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13197; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13198; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13199; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13200; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13201; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13202; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13203; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13204; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13205; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13206; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13207; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13208; SKIP-CACHE-INV-NEXT: s_endpgm 13209; 13210; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13211; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13212; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13213; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13214; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13215; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13216; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13217; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13218; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13219; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13220; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13221; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13222; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13223; 13224; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13225; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13226; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13227; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13228; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13229; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13230; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13231; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13232; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13233; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13234; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13235; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13236; GFX90A-TGSPLIT-NEXT: s_endpgm 13237; 13238; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13239; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13240; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13241; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13242; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13243; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13244; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13245; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13246; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13247; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13248; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13249; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13250; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13251; 13252; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13253; GFX940-TGSPLIT: ; %bb.0: ; %entry 
13254; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13255; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13256; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13257; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13258; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13259; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13260; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13261; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13262; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13263; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13264; GFX940-TGSPLIT-NEXT: s_endpgm 13265; 13266; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13267; GFX11-WGP: ; %bb.0: ; %entry 13268; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13269; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13270; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13271; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13272; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13273; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13274; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13275; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13276; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13277; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13278; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13279; GFX11-WGP-NEXT: s_endpgm 13280; 13281; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13282; GFX11-CU: ; %bb.0: ; %entry 13283; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13284; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13285; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13286; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13287; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13288; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13289; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13290; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13291; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13292; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13293; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13294; GFX11-CU-NEXT: s_endpgm 13295; 13296; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13297; GFX12-WGP: ; %bb.0: ; %entry 13298; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13299; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13300; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13301; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13302; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13303; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13304; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13305; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13306; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13307; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13308; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13309; GFX12-WGP-NEXT: s_endpgm 13310; 13311; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: 13312; GFX12-CU: ; %bb.0: ; %entry 13313; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13314; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13315; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13316; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13317; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13318; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13319; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13320; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13321; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13322; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13323; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13324; GFX12-CU-NEXT: s_endpgm 13325 ptr %out, i32 %in, i32 %old) { 13326entry: 13327 %gep = getelementptr i32, ptr %out, i32 4 13328 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic 13329 ret void 13330} 13331 13332define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( 13333; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13334; GFX7: ; %bb.0: ; %entry 13335; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13336; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 13337; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13338; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13339; GFX7-NEXT: s_mov_b64 s[10:11], 16 13340; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13341; GFX7-NEXT: s_mov_b32 s4, s8 13342; GFX7-NEXT: s_mov_b32 s5, s9 13343; GFX7-NEXT: s_mov_b32 s9, s10 13344; GFX7-NEXT: s_mov_b32 s8, s11 13345; GFX7-NEXT: s_add_u32 s4, s4, s9 13346; GFX7-NEXT: s_addc_u32 s8, s5, s8 13347; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13348; GFX7-NEXT: s_mov_b32 s5, s8 13349; GFX7-NEXT: v_mov_b32_e32 v2, s7 13350; GFX7-NEXT: v_mov_b32_e32 v0, s6 13351; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13352; GFX7-NEXT: v_mov_b32_e32 v3, v0 13353; GFX7-NEXT: v_mov_b32_e32 v0, s4 13354; GFX7-NEXT: v_mov_b32_e32 v1, s5 13355; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13356; GFX7-NEXT: s_endpgm 13357; 13358; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13359; GFX10-WGP: ; %bb.0: ; %entry 13360; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13361; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13362; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13363; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13364; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13365; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13366; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13367; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13368; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13369; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13370; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13371; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13372; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13373; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13374; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13375; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13376; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13377; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13378; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13379; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13380; 
GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13381; GFX10-WGP-NEXT: s_endpgm 13382; 13383; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13384; GFX10-CU: ; %bb.0: ; %entry 13385; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13386; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13387; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13388; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13389; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13390; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13391; GFX10-CU-NEXT: s_mov_b32 s4, s8 13392; GFX10-CU-NEXT: s_mov_b32 s5, s9 13393; GFX10-CU-NEXT: s_mov_b32 s9, s10 13394; GFX10-CU-NEXT: s_mov_b32 s8, s11 13395; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13396; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13397; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13398; GFX10-CU-NEXT: s_mov_b32 s5, s8 13399; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13400; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13401; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13402; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13403; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13404; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13405; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13406; GFX10-CU-NEXT: s_endpgm 13407; 13408; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13409; SKIP-CACHE-INV: ; %bb.0: ; %entry 13410; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13411; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13412; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13413; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13414; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13415; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13416; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13417; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13418; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13419; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13420; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13421; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13422; 
SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13423; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13424; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13425; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13426; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13427; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13428; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13429; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13430; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13431; SKIP-CACHE-INV-NEXT: s_endpgm 13432; 13433; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13434; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13435; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13436; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13437; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13438; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13439; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13440; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13441; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13442; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13443; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13444; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13445; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13446; 13447; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13448; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13449; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13450; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13451; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13452; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13453; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13454; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13455; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13456; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13457; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13458; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13459; GFX90A-TGSPLIT-NEXT: s_endpgm 13460; 13461; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13462; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13463; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13464; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13465; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13466; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13467; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13468; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13469; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13470; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13471; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13472; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13473; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13474; 13475; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13476; GFX940-TGSPLIT: ; %bb.0: ; %entry 13477; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13478; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13479; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13480; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13481; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13482; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13483; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13484; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13485; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13486; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13487; GFX940-TGSPLIT-NEXT: s_endpgm 13488; 13489; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13490; GFX11-WGP: ; %bb.0: ; %entry 13491; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13492; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13493; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13494; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13495; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13496; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13497; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13498; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13499; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13500; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13501; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13502; GFX11-WGP-NEXT: s_endpgm 13503; 13504; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13505; GFX11-CU: ; %bb.0: ; %entry 13506; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13507; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13508; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13509; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13510; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13511; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13512; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13513; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13514; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13515; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13516; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13517; GFX11-CU-NEXT: s_endpgm 13518; 13519; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13520; GFX12-WGP: ; %bb.0: ; %entry 13521; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13522; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13523; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13524; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13525; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13526; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13527; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13528; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13529; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13530; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13531; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13532; GFX12-WGP-NEXT: s_endpgm 13533; 13534; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: 13535; GFX12-CU: ; %bb.0: ; %entry 
13536; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13537; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13538; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13539; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13540; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13541; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13542; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13543; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13544; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13545; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13546; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13547; GFX12-CU-NEXT: s_endpgm 13548 ptr %out, i32 %in, i32 %old) { 13549entry: 13550 %gep = getelementptr i32, ptr %out, i32 4 13551 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire 13552 ret void 13553} 13554 13555define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( 13556; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13557; GFX7: ; %bb.0: ; %entry 13558; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13559; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13560; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13561; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13562; GFX7-NEXT: s_mov_b64 s[10:11], 16 13563; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13564; GFX7-NEXT: s_mov_b32 s4, s8 13565; GFX7-NEXT: s_mov_b32 s5, s9 13566; GFX7-NEXT: s_mov_b32 s9, s10 13567; GFX7-NEXT: s_mov_b32 s8, s11 13568; GFX7-NEXT: s_add_u32 s4, s4, s9 13569; GFX7-NEXT: s_addc_u32 s8, s5, s8 13570; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13571; GFX7-NEXT: s_mov_b32 s5, s8 13572; GFX7-NEXT: v_mov_b32_e32 v2, s7 13573; GFX7-NEXT: v_mov_b32_e32 v0, s6 13574; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13575; GFX7-NEXT: v_mov_b32_e32 v3, v0 13576; GFX7-NEXT: v_mov_b32_e32 v0, s4 13577; GFX7-NEXT: v_mov_b32_e32 v1, s5 13578; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13579; GFX7-NEXT: s_endpgm 13580; 13581; GFX10-WGP-LABEL: 
flat_singlethread_one_as_acquire_acquire_cmpxchg: 13582; GFX10-WGP: ; %bb.0: ; %entry 13583; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13584; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13585; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13586; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13587; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13588; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13589; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13590; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13591; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13592; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13593; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13594; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13595; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13596; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13597; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13598; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13599; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13600; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13601; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13602; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13603; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13604; GFX10-WGP-NEXT: s_endpgm 13605; 13606; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13607; GFX10-CU: ; %bb.0: ; %entry 13608; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13609; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13610; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13611; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13612; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13613; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13614; GFX10-CU-NEXT: s_mov_b32 s4, s8 13615; GFX10-CU-NEXT: s_mov_b32 s5, s9 13616; GFX10-CU-NEXT: s_mov_b32 s9, s10 13617; GFX10-CU-NEXT: s_mov_b32 s8, s11 13618; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13619; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13620; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13621; GFX10-CU-NEXT: s_mov_b32 s5, s8 13622; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13623; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 
13624; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13625; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13626; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13627; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13628; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13629; GFX10-CU-NEXT: s_endpgm 13630; 13631; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13632; SKIP-CACHE-INV: ; %bb.0: ; %entry 13633; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13634; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13635; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13636; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13637; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13638; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13639; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13640; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13641; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13642; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13643; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 13644; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13645; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13646; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13647; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13648; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13649; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13650; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13651; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13652; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13653; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13654; SKIP-CACHE-INV-NEXT: s_endpgm 13655; 13656; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13657; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13658; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13659; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13660; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13661; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13662; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s7 13663; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13664; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13665; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13666; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13667; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13668; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13669; 13670; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13671; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13672; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13673; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13674; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13675; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13676; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13677; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13678; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13679; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13680; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13681; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13682; GFX90A-TGSPLIT-NEXT: s_endpgm 13683; 13684; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13685; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13686; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13687; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13688; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13689; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13690; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13691; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13692; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13693; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13694; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13695; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13696; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13697; 
13698; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13699; GFX940-TGSPLIT: ; %bb.0: ; %entry 13700; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13701; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13702; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13703; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13704; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13705; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13706; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13707; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13708; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13709; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13710; GFX940-TGSPLIT-NEXT: s_endpgm 13711; 13712; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13713; GFX11-WGP: ; %bb.0: ; %entry 13714; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13715; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13716; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13717; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13718; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13719; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13720; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13721; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13722; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13723; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13724; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13725; GFX11-WGP-NEXT: s_endpgm 13726; 13727; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13728; GFX11-CU: ; %bb.0: ; %entry 13729; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13730; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13731; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13732; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13733; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13734; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13735; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13736; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 
13737; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13738; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13739; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13740; GFX11-CU-NEXT: s_endpgm 13741; 13742; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13743; GFX12-WGP: ; %bb.0: ; %entry 13744; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13745; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13746; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13747; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13748; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13749; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13750; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13751; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13752; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13753; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13754; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13755; GFX12-WGP-NEXT: s_endpgm 13756; 13757; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: 13758; GFX12-CU: ; %bb.0: ; %entry 13759; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13760; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13761; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13762; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13763; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13764; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13765; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13766; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13767; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13768; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13769; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13770; GFX12-CU-NEXT: s_endpgm 13771 ptr %out, i32 %in, i32 %old) { 13772entry: 13773 %gep = getelementptr i32, ptr %out, i32 4 13774 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire 13775 ret void 13776} 13777 13778define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( 13779; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 
13780; GFX7: ; %bb.0: ; %entry 13781; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 13782; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13783; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 13784; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 13785; GFX7-NEXT: s_mov_b64 s[10:11], 16 13786; GFX7-NEXT: s_waitcnt lgkmcnt(0) 13787; GFX7-NEXT: s_mov_b32 s4, s8 13788; GFX7-NEXT: s_mov_b32 s5, s9 13789; GFX7-NEXT: s_mov_b32 s9, s10 13790; GFX7-NEXT: s_mov_b32 s8, s11 13791; GFX7-NEXT: s_add_u32 s4, s4, s9 13792; GFX7-NEXT: s_addc_u32 s8, s5, s8 13793; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13794; GFX7-NEXT: s_mov_b32 s5, s8 13795; GFX7-NEXT: v_mov_b32_e32 v2, s7 13796; GFX7-NEXT: v_mov_b32_e32 v0, s6 13797; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13798; GFX7-NEXT: v_mov_b32_e32 v3, v0 13799; GFX7-NEXT: v_mov_b32_e32 v0, s4 13800; GFX7-NEXT: v_mov_b32_e32 v1, s5 13801; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13802; GFX7-NEXT: s_endpgm 13803; 13804; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13805; GFX10-WGP: ; %bb.0: ; %entry 13806; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 13807; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13808; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 13809; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 13810; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 13811; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 13812; GFX10-WGP-NEXT: s_mov_b32 s4, s8 13813; GFX10-WGP-NEXT: s_mov_b32 s5, s9 13814; GFX10-WGP-NEXT: s_mov_b32 s9, s10 13815; GFX10-WGP-NEXT: s_mov_b32 s8, s11 13816; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 13817; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 13818; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13819; GFX10-WGP-NEXT: s_mov_b32 s5, s8 13820; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 13821; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 13822; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13823; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 13824; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 13825; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 13826; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13827; GFX10-WGP-NEXT: s_endpgm 13828; 13829; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13830; GFX10-CU: ; %bb.0: ; %entry 13831; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 13832; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 13833; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 13834; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 13835; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 13836; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 13837; GFX10-CU-NEXT: s_mov_b32 s4, s8 13838; GFX10-CU-NEXT: s_mov_b32 s5, s9 13839; GFX10-CU-NEXT: s_mov_b32 s9, s10 13840; GFX10-CU-NEXT: s_mov_b32 s8, s11 13841; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 13842; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 13843; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 13844; GFX10-CU-NEXT: s_mov_b32 s5, s8 13845; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 13846; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 13847; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13848; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 13849; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 13850; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 13851; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13852; GFX10-CU-NEXT: s_endpgm 13853; 13854; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13855; SKIP-CACHE-INV: ; %bb.0: ; %entry 13856; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 13857; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 13858; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 13859; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 13860; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 13861; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 13862; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 13863; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 13864; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 13865; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 13866; SKIP-CACHE-INV-NEXT: s_add_u32 
s0, s0, s5 13867; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 13868; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 13869; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 13870; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 13871; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 13872; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13873; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 13874; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 13875; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 13876; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 13877; SKIP-CACHE-INV-NEXT: s_endpgm 13878; 13879; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13880; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 13881; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13882; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13883; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13884; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13885; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13886; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13887; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13888; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13889; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13890; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13891; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 13892; 13893; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13894; GFX90A-TGSPLIT: ; %bb.0: ; %entry 13895; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 13896; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 13897; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 13898; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13899; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 13900; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 13901; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13902; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13903; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 13904; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13905; GFX90A-TGSPLIT-NEXT: s_endpgm 13906; 13907; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13908; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 13909; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13910; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13911; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13912; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13913; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13914; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13915; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13916; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13917; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13918; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13919; GFX940-NOTTGSPLIT-NEXT: s_endpgm 13920; 13921; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13922; GFX940-TGSPLIT: ; %bb.0: ; %entry 13923; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13924; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 13925; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 13926; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 13927; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 13928; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 13929; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13930; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 13931; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 13932; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 13933; GFX940-TGSPLIT-NEXT: s_endpgm 13934; 13935; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13936; GFX11-WGP: ; %bb.0: ; %entry 13937; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13938; GFX11-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 13939; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13940; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 13941; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 13942; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 13943; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13944; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 13945; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 13946; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 13947; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13948; GFX11-WGP-NEXT: s_endpgm 13949; 13950; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13951; GFX11-CU: ; %bb.0: ; %entry 13952; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13953; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13954; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13955; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 13956; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 13957; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 13958; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13959; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 13960; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 13961; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 13962; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13963; GFX11-CU-NEXT: s_endpgm 13964; 13965; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: 13966; GFX12-WGP: ; %bb.0: ; %entry 13967; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13968; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 13969; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 13970; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 13971; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 13972; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 13973; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13974; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 13975; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 13976; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 13977; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13978; GFX12-WGP-NEXT: s_endpgm 13979; 13980; GFX12-CU-LABEL: 
flat_singlethread_one_as_release_acquire_cmpxchg: 13981; GFX12-CU: ; %bb.0: ; %entry 13982; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 13983; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 13984; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 13985; GFX12-CU-NEXT: s_wait_kmcnt 0x0 13986; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 13987; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 13988; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 13989; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 13990; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 13991; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 13992; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 13993; GFX12-CU-NEXT: s_endpgm 13994 ptr %out, i32 %in, i32 %old) { 13995entry: 13996 %gep = getelementptr i32, ptr %out, i32 4 13997 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire 13998 ret void 13999} 14000 14001define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( 14002; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14003; GFX7: ; %bb.0: ; %entry 14004; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14005; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14006; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14007; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14008; GFX7-NEXT: s_mov_b64 s[10:11], 16 14009; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14010; GFX7-NEXT: s_mov_b32 s4, s8 14011; GFX7-NEXT: s_mov_b32 s5, s9 14012; GFX7-NEXT: s_mov_b32 s9, s10 14013; GFX7-NEXT: s_mov_b32 s8, s11 14014; GFX7-NEXT: s_add_u32 s4, s4, s9 14015; GFX7-NEXT: s_addc_u32 s8, s5, s8 14016; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14017; GFX7-NEXT: s_mov_b32 s5, s8 14018; GFX7-NEXT: v_mov_b32_e32 v2, s7 14019; GFX7-NEXT: v_mov_b32_e32 v0, s6 14020; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14021; GFX7-NEXT: v_mov_b32_e32 v3, v0 14022; GFX7-NEXT: v_mov_b32_e32 v0, s4 14023; GFX7-NEXT: v_mov_b32_e32 v1, s5 14024; GFX7-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] 14025; GFX7-NEXT: s_endpgm 14026; 14027; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14028; GFX10-WGP: ; %bb.0: ; %entry 14029; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14030; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14031; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14032; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14033; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14034; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14035; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14036; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14037; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14038; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14039; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14040; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14041; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14042; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14043; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14044; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14045; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14046; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14047; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14048; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14049; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14050; GFX10-WGP-NEXT: s_endpgm 14051; 14052; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14053; GFX10-CU: ; %bb.0: ; %entry 14054; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14055; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14056; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14057; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14058; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14059; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14060; GFX10-CU-NEXT: s_mov_b32 s4, s8 14061; GFX10-CU-NEXT: s_mov_b32 s5, s9 14062; GFX10-CU-NEXT: s_mov_b32 s9, s10 14063; GFX10-CU-NEXT: s_mov_b32 s8, s11 14064; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14065; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14066; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14067; GFX10-CU-NEXT: s_mov_b32 s5, s8 14068; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14069; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14070; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14071; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14072; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14073; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14074; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14075; GFX10-CU-NEXT: s_endpgm 14076; 14077; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14078; SKIP-CACHE-INV: ; %bb.0: ; %entry 14079; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14080; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14081; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14082; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14083; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14084; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14085; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14086; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14087; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14088; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14089; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14090; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14091; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14092; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14093; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14094; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14095; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14096; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14097; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14098; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14099; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14100; SKIP-CACHE-INV-NEXT: s_endpgm 14101; 14102; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14103; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14104; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14105; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14106; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14107; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14108; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14109; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14110; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14111; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14112; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14113; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14114; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14115; 14116; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14117; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14118; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14119; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14120; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14121; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14122; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14123; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14124; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14125; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14126; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14127; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14128; GFX90A-TGSPLIT-NEXT: s_endpgm 14129; 14130; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14131; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14132; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14133; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14134; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14135; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14136; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14137; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14138; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14139; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14140; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14141; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14142; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14143; 14144; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14145; GFX940-TGSPLIT: ; %bb.0: ; %entry 14146; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14147; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14148; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14149; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14150; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14151; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14152; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14153; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14154; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14155; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14156; GFX940-TGSPLIT-NEXT: s_endpgm 14157; 14158; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14159; GFX11-WGP: ; %bb.0: ; %entry 14160; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14161; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14162; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14163; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14164; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14165; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14166; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14167; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14168; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14169; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14170; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14171; GFX11-WGP-NEXT: s_endpgm 14172; 14173; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14174; GFX11-CU: ; %bb.0: ; %entry 14175; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14176; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14177; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14178; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14179; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14180; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14181; GFX11-CU-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14182; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14183; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14184; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14185; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14186; GFX11-CU-NEXT: s_endpgm 14187; 14188; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14189; GFX12-WGP: ; %bb.0: ; %entry 14190; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14191; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14192; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14193; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14194; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14195; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14196; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14197; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14198; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14199; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14200; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14201; GFX12-WGP-NEXT: s_endpgm 14202; 14203; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: 14204; GFX12-CU: ; %bb.0: ; %entry 14205; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14206; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14207; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14208; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14209; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14210; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14211; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14212; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14213; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14214; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14215; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14216; GFX12-CU-NEXT: s_endpgm 14217 ptr %out, i32 %in, i32 %old) { 14218entry: 14219 %gep = getelementptr i32, ptr %out, i32 4 14220 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire 14221 ret void 14222} 14223 14224define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_acquire_cmpxchg( 14225; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14226; GFX7: ; %bb.0: ; %entry 14227; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14228; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14229; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14230; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14231; GFX7-NEXT: s_mov_b64 s[10:11], 16 14232; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14233; GFX7-NEXT: s_mov_b32 s4, s8 14234; GFX7-NEXT: s_mov_b32 s5, s9 14235; GFX7-NEXT: s_mov_b32 s9, s10 14236; GFX7-NEXT: s_mov_b32 s8, s11 14237; GFX7-NEXT: s_add_u32 s4, s4, s9 14238; GFX7-NEXT: s_addc_u32 s8, s5, s8 14239; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14240; GFX7-NEXT: s_mov_b32 s5, s8 14241; GFX7-NEXT: v_mov_b32_e32 v2, s7 14242; GFX7-NEXT: v_mov_b32_e32 v0, s6 14243; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14244; GFX7-NEXT: v_mov_b32_e32 v3, v0 14245; GFX7-NEXT: v_mov_b32_e32 v0, s4 14246; GFX7-NEXT: v_mov_b32_e32 v1, s5 14247; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14248; GFX7-NEXT: s_endpgm 14249; 14250; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14251; GFX10-WGP: ; %bb.0: ; %entry 14252; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14253; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14254; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14255; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14256; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14257; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14258; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14259; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14260; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14261; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14262; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14263; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14264; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14265; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14266; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14267; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14268; GFX10-WGP-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14269; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14270; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14271; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14272; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14273; GFX10-WGP-NEXT: s_endpgm 14274; 14275; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14276; GFX10-CU: ; %bb.0: ; %entry 14277; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14278; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14279; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14280; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14281; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14282; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14283; GFX10-CU-NEXT: s_mov_b32 s4, s8 14284; GFX10-CU-NEXT: s_mov_b32 s5, s9 14285; GFX10-CU-NEXT: s_mov_b32 s9, s10 14286; GFX10-CU-NEXT: s_mov_b32 s8, s11 14287; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14288; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14289; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14290; GFX10-CU-NEXT: s_mov_b32 s5, s8 14291; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14292; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14293; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14294; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14295; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14296; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14297; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14298; GFX10-CU-NEXT: s_endpgm 14299; 14300; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14301; SKIP-CACHE-INV: ; %bb.0: ; %entry 14302; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14303; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14304; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14305; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14306; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14307; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14308; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14309; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14310; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14311; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14312; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14313; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14314; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14315; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14316; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14317; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14318; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14319; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14320; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14321; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14322; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14323; SKIP-CACHE-INV-NEXT: s_endpgm 14324; 14325; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14326; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14327; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14328; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14329; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14330; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14331; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14332; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14333; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14334; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14335; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14336; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14337; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14338; 14339; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14340; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14341; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14342; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14343; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14344; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14345; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14346; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s6 14347; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14348; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14349; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14350; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14351; GFX90A-TGSPLIT-NEXT: s_endpgm 14352; 14353; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14354; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14355; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14356; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14357; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14358; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14359; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14360; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14361; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14362; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14363; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14364; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14365; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14366; 14367; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14368; GFX940-TGSPLIT: ; %bb.0: ; %entry 14369; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14370; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14371; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14372; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14373; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14374; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14375; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14376; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14377; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14378; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14379; GFX940-TGSPLIT-NEXT: s_endpgm 14380; 14381; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14382; 
GFX11-WGP: ; %bb.0: ; %entry 14383; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14384; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14385; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14386; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14387; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14388; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14389; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14390; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14391; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14392; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14393; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14394; GFX11-WGP-NEXT: s_endpgm 14395; 14396; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14397; GFX11-CU: ; %bb.0: ; %entry 14398; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14399; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14400; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14401; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14402; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14403; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14404; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14405; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14406; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14407; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14408; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14409; GFX11-CU-NEXT: s_endpgm 14410; 14411; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14412; GFX12-WGP: ; %bb.0: ; %entry 14413; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14414; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14415; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14416; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14417; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14418; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14419; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14420; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14421; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14422; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14423; GFX12-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14424; GFX12-WGP-NEXT: s_endpgm 14425; 14426; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: 14427; GFX12-CU: ; %bb.0: ; %entry 14428; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14429; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14430; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14431; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14432; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14433; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14434; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14435; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14436; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14437; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14438; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14439; GFX12-CU-NEXT: s_endpgm 14440 ptr %out, i32 %in, i32 %old) { 14441entry: 14442 %gep = getelementptr i32, ptr %out, i32 4 14443 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire 14444 ret void 14445} 14446 14447define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( 14448; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14449; GFX7: ; %bb.0: ; %entry 14450; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14451; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14452; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14453; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14454; GFX7-NEXT: s_mov_b64 s[10:11], 16 14455; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14456; GFX7-NEXT: s_mov_b32 s4, s8 14457; GFX7-NEXT: s_mov_b32 s5, s9 14458; GFX7-NEXT: s_mov_b32 s9, s10 14459; GFX7-NEXT: s_mov_b32 s8, s11 14460; GFX7-NEXT: s_add_u32 s4, s4, s9 14461; GFX7-NEXT: s_addc_u32 s8, s5, s8 14462; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14463; GFX7-NEXT: s_mov_b32 s5, s8 14464; GFX7-NEXT: v_mov_b32_e32 v2, s7 14465; GFX7-NEXT: v_mov_b32_e32 v0, s6 14466; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14467; GFX7-NEXT: v_mov_b32_e32 v3, v0 
14468; GFX7-NEXT: v_mov_b32_e32 v0, s4 14469; GFX7-NEXT: v_mov_b32_e32 v1, s5 14470; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14471; GFX7-NEXT: s_endpgm 14472; 14473; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14474; GFX10-WGP: ; %bb.0: ; %entry 14475; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14476; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14477; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14478; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14479; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14480; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14481; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14482; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14483; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14484; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14485; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14486; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14487; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14488; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14489; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14490; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14491; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14492; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14493; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14494; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14495; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14496; GFX10-WGP-NEXT: s_endpgm 14497; 14498; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14499; GFX10-CU: ; %bb.0: ; %entry 14500; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14501; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14502; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14503; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14504; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14505; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14506; GFX10-CU-NEXT: s_mov_b32 s4, s8 14507; GFX10-CU-NEXT: s_mov_b32 s5, s9 14508; GFX10-CU-NEXT: s_mov_b32 s9, s10 14509; GFX10-CU-NEXT: s_mov_b32 s8, s11 14510; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14511; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14512; 
GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14513; GFX10-CU-NEXT: s_mov_b32 s5, s8 14514; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14515; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14516; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14517; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14518; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14519; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14520; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14521; GFX10-CU-NEXT: s_endpgm 14522; 14523; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14524; SKIP-CACHE-INV: ; %bb.0: ; %entry 14525; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14526; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14527; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14528; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14529; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14530; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14531; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14532; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14533; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14534; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14535; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14536; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14537; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14538; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14541; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14542; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14543; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14544; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14545; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14546; SKIP-CACHE-INV-NEXT: s_endpgm 14547; 14548; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14549; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14550; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14551; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14552; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14553; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14554; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14555; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14556; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14557; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14558; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14559; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14560; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14561; 14562; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14563; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14564; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14565; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14566; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14567; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14568; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14569; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14570; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14571; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14572; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14573; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14574; GFX90A-TGSPLIT-NEXT: s_endpgm 14575; 14576; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14577; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14578; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14579; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14580; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14581; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14582; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14583; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14584; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14585; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 14586; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14587; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14588; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14589; 14590; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14591; GFX940-TGSPLIT: ; %bb.0: ; %entry 14592; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14593; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14594; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14595; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14596; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14597; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14598; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14599; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14600; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14601; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14602; GFX940-TGSPLIT-NEXT: s_endpgm 14603; 14604; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14605; GFX11-WGP: ; %bb.0: ; %entry 14606; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14607; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14608; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14609; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14610; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14611; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14612; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14613; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14614; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14615; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14616; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14617; GFX11-WGP-NEXT: s_endpgm 14618; 14619; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14620; GFX11-CU: ; %bb.0: ; %entry 14621; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14622; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14623; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14624; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
14625; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14626; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14627; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14628; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14629; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14630; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14631; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14632; GFX11-CU-NEXT: s_endpgm 14633; 14634; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14635; GFX12-WGP: ; %bb.0: ; %entry 14636; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14637; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14638; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14639; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14640; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14641; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14642; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14643; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14644; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 14645; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14646; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14647; GFX12-WGP-NEXT: s_endpgm 14648; 14649; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: 14650; GFX12-CU: ; %bb.0: ; %entry 14651; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14652; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14653; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14654; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14655; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14656; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14657; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14658; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14659; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14660; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14661; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14662; GFX12-CU-NEXT: s_endpgm 14663 ptr %out, i32 %in, i32 %old) { 14664entry: 14665 %gep = getelementptr i32, ptr %out, i32 4 14666 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("singlethread-one-as") monotonic seq_cst 14667 ret void 14668} 14669 14670define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( 14671; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14672; GFX7: ; %bb.0: ; %entry 14673; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14674; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14675; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14676; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14677; GFX7-NEXT: s_mov_b64 s[10:11], 16 14678; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14679; GFX7-NEXT: s_mov_b32 s4, s8 14680; GFX7-NEXT: s_mov_b32 s5, s9 14681; GFX7-NEXT: s_mov_b32 s9, s10 14682; GFX7-NEXT: s_mov_b32 s8, s11 14683; GFX7-NEXT: s_add_u32 s4, s4, s9 14684; GFX7-NEXT: s_addc_u32 s8, s5, s8 14685; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14686; GFX7-NEXT: s_mov_b32 s5, s8 14687; GFX7-NEXT: v_mov_b32_e32 v2, s7 14688; GFX7-NEXT: v_mov_b32_e32 v0, s6 14689; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14690; GFX7-NEXT: v_mov_b32_e32 v3, v0 14691; GFX7-NEXT: v_mov_b32_e32 v0, s4 14692; GFX7-NEXT: v_mov_b32_e32 v1, s5 14693; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14694; GFX7-NEXT: s_endpgm 14695; 14696; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14697; GFX10-WGP: ; %bb.0: ; %entry 14698; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14699; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14700; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14701; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14702; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14703; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14704; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14705; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14706; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14707; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14708; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14709; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14710; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14711; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14712; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14713; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14714; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14715; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14716; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14717; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14718; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14719; GFX10-WGP-NEXT: s_endpgm 14720; 14721; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14722; GFX10-CU: ; %bb.0: ; %entry 14723; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14724; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14725; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14726; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14727; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14728; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14729; GFX10-CU-NEXT: s_mov_b32 s4, s8 14730; GFX10-CU-NEXT: s_mov_b32 s5, s9 14731; GFX10-CU-NEXT: s_mov_b32 s9, s10 14732; GFX10-CU-NEXT: s_mov_b32 s8, s11 14733; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 14734; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14735; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14736; GFX10-CU-NEXT: s_mov_b32 s5, s8 14737; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14738; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14739; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14740; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14741; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14742; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14743; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14744; GFX10-CU-NEXT: s_endpgm 14745; 14746; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14747; SKIP-CACHE-INV: ; %bb.0: ; %entry 14748; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14749; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14750; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14751; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14752; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14753; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 
14754; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14755; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14756; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14757; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14758; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14759; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14760; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14761; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14762; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14763; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14764; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14765; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14766; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14767; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14768; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14769; SKIP-CACHE-INV-NEXT: s_endpgm 14770; 14771; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14772; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14773; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14774; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14775; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14776; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14777; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14778; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14779; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14780; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14781; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14782; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14783; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 14784; 14785; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14786; GFX90A-TGSPLIT: ; %bb.0: ; %entry 14787; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14788; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14789; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14790; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 14791; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 14792; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 14793; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14794; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14795; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 14796; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14797; GFX90A-TGSPLIT-NEXT: s_endpgm 14798; 14799; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14800; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 14801; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14802; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14803; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14804; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14805; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14806; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14807; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14808; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14809; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14810; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14811; GFX940-NOTTGSPLIT-NEXT: s_endpgm 14812; 14813; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14814; GFX940-TGSPLIT: ; %bb.0: ; %entry 14815; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14816; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 14817; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 14818; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 14819; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 14820; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 14821; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14822; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 14823; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 14824; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 14825; GFX940-TGSPLIT-NEXT: s_endpgm 14826; 
14827; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14828; GFX11-WGP: ; %bb.0: ; %entry 14829; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14830; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14831; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14832; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 14833; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 14834; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 14835; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14836; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 14837; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 14838; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 14839; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14840; GFX11-WGP-NEXT: s_endpgm 14841; 14842; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14843; GFX11-CU: ; %bb.0: ; %entry 14844; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14845; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14846; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14847; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 14848; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 14849; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 14850; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14851; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 14852; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 14853; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 14854; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14855; GFX11-CU-NEXT: s_endpgm 14856; 14857; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14858; GFX12-WGP: ; %bb.0: ; %entry 14859; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14860; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 14861; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 14862; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 14863; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 14864; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 14865; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14866; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 14867; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 
s0 14868; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 14869; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14870; GFX12-WGP-NEXT: s_endpgm 14871; 14872; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: 14873; GFX12-CU: ; %bb.0: ; %entry 14874; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 14875; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 14876; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 14877; GFX12-CU-NEXT: s_wait_kmcnt 0x0 14878; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 14879; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 14880; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14881; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 14882; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 14883; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 14884; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 14885; GFX12-CU-NEXT: s_endpgm 14886 ptr %out, i32 %in, i32 %old) { 14887entry: 14888 %gep = getelementptr i32, ptr %out, i32 4 14889 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst 14890 ret void 14891} 14892 14893define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( 14894; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 14895; GFX7: ; %bb.0: ; %entry 14896; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 14897; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14898; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 14899; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 14900; GFX7-NEXT: s_mov_b64 s[10:11], 16 14901; GFX7-NEXT: s_waitcnt lgkmcnt(0) 14902; GFX7-NEXT: s_mov_b32 s4, s8 14903; GFX7-NEXT: s_mov_b32 s5, s9 14904; GFX7-NEXT: s_mov_b32 s9, s10 14905; GFX7-NEXT: s_mov_b32 s8, s11 14906; GFX7-NEXT: s_add_u32 s4, s4, s9 14907; GFX7-NEXT: s_addc_u32 s8, s5, s8 14908; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14909; GFX7-NEXT: s_mov_b32 s5, s8 14910; GFX7-NEXT: v_mov_b32_e32 v2, s7 14911; GFX7-NEXT: v_mov_b32_e32 v0, s6 14912; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 14913; GFX7-NEXT: v_mov_b32_e32 v3, v0 14914; GFX7-NEXT: v_mov_b32_e32 v0, s4 14915; GFX7-NEXT: v_mov_b32_e32 v1, s5 14916; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14917; GFX7-NEXT: s_endpgm 14918; 14919; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 14920; GFX10-WGP: ; %bb.0: ; %entry 14921; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 14922; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14923; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 14924; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 14925; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 14926; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 14927; GFX10-WGP-NEXT: s_mov_b32 s4, s8 14928; GFX10-WGP-NEXT: s_mov_b32 s5, s9 14929; GFX10-WGP-NEXT: s_mov_b32 s9, s10 14930; GFX10-WGP-NEXT: s_mov_b32 s8, s11 14931; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 14932; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 14933; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14934; GFX10-WGP-NEXT: s_mov_b32 s5, s8 14935; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 14936; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 14937; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14938; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 14939; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 14940; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 14941; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14942; GFX10-WGP-NEXT: s_endpgm 14943; 14944; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 14945; GFX10-CU: ; %bb.0: ; %entry 14946; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 14947; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 14948; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 14949; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 14950; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 14951; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 14952; GFX10-CU-NEXT: s_mov_b32 s4, s8 14953; GFX10-CU-NEXT: s_mov_b32 s5, s9 14954; GFX10-CU-NEXT: s_mov_b32 s9, s10 14955; GFX10-CU-NEXT: s_mov_b32 s8, s11 14956; GFX10-CU-NEXT: s_add_u32 
s4, s4, s9 14957; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 14958; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 14959; GFX10-CU-NEXT: s_mov_b32 s5, s8 14960; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 14961; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 14962; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14963; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 14964; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 14965; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 14966; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14967; GFX10-CU-NEXT: s_endpgm 14968; 14969; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 14970; SKIP-CACHE-INV: ; %bb.0: ; %entry 14971; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 14972; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 14973; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 14974; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 14975; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 14976; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 14977; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 14978; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 14979; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 14980; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 14981; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 14982; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 14983; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 14984; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 14985; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 14986; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 14987; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 14988; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 14989; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 14990; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 14991; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 14992; SKIP-CACHE-INV-NEXT: s_endpgm 14993; 14994; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 14995; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 14996; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 14997; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 14998; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 14999; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15000; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15001; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15002; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15003; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15004; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15005; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15006; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15007; 15008; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15009; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15010; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15011; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15012; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15013; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15014; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15015; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15016; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15017; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15018; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15019; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15020; GFX90A-TGSPLIT-NEXT: s_endpgm 15021; 15022; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15023; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15024; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15025; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15026; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15027; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15028; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15029; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15030; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec 15031; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15032; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15033; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15034; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15035; 15036; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15037; GFX940-TGSPLIT: ; %bb.0: ; %entry 15038; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15039; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15040; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15041; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15042; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15043; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15044; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15045; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15046; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15047; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15048; GFX940-TGSPLIT-NEXT: s_endpgm 15049; 15050; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15051; GFX11-WGP: ; %bb.0: ; %entry 15052; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15053; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15054; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15055; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15056; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15057; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15058; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15059; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15060; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15061; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15062; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15063; GFX11-WGP-NEXT: s_endpgm 15064; 15065; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15066; GFX11-CU: ; %bb.0: ; %entry 15067; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15068; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15069; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 
0xc 15070; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15071; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15072; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15073; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15074; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15075; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15076; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15077; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15078; GFX11-CU-NEXT: s_endpgm 15079; 15080; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15081; GFX12-WGP: ; %bb.0: ; %entry 15082; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15083; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15084; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15085; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15086; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15087; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15088; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15089; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 15090; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15091; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15092; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15093; GFX12-WGP-NEXT: s_endpgm 15094; 15095; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: 15096; GFX12-CU: ; %bb.0: ; %entry 15097; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15098; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15099; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15100; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15101; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15102; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15103; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15104; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15105; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15106; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15107; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15108; GFX12-CU-NEXT: s_endpgm 15109 ptr %out, i32 %in, i32 %old) { 15110entry: 15111 %gep = getelementptr i32, ptr %out, i32 4 15112 %val = cmpxchg 
volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst 15113 ret void 15114} 15115 15116define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( 15117; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15118; GFX7: ; %bb.0: ; %entry 15119; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 15120; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15121; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 15122; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 15123; GFX7-NEXT: s_mov_b64 s[10:11], 16 15124; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15125; GFX7-NEXT: s_mov_b32 s4, s8 15126; GFX7-NEXT: s_mov_b32 s5, s9 15127; GFX7-NEXT: s_mov_b32 s9, s10 15128; GFX7-NEXT: s_mov_b32 s8, s11 15129; GFX7-NEXT: s_add_u32 s4, s4, s9 15130; GFX7-NEXT: s_addc_u32 s8, s5, s8 15131; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15132; GFX7-NEXT: s_mov_b32 s5, s8 15133; GFX7-NEXT: v_mov_b32_e32 v2, s7 15134; GFX7-NEXT: v_mov_b32_e32 v0, s6 15135; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15136; GFX7-NEXT: v_mov_b32_e32 v3, v0 15137; GFX7-NEXT: v_mov_b32_e32 v0, s4 15138; GFX7-NEXT: v_mov_b32_e32 v1, s5 15139; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15140; GFX7-NEXT: s_endpgm 15141; 15142; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15143; GFX10-WGP: ; %bb.0: ; %entry 15144; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 15145; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15146; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 15147; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 15148; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 15149; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15150; GFX10-WGP-NEXT: s_mov_b32 s4, s8 15151; GFX10-WGP-NEXT: s_mov_b32 s5, s9 15152; GFX10-WGP-NEXT: s_mov_b32 s9, s10 15153; GFX10-WGP-NEXT: s_mov_b32 s8, s11 15154; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 15155; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 15156; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15157; 
GFX10-WGP-NEXT: s_mov_b32 s5, s8 15158; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 15159; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15160; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15161; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15162; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15163; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15164; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15165; GFX10-WGP-NEXT: s_endpgm 15166; 15167; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15168; GFX10-CU: ; %bb.0: ; %entry 15169; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 15170; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15171; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 15172; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 15173; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 15174; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15175; GFX10-CU-NEXT: s_mov_b32 s4, s8 15176; GFX10-CU-NEXT: s_mov_b32 s5, s9 15177; GFX10-CU-NEXT: s_mov_b32 s9, s10 15178; GFX10-CU-NEXT: s_mov_b32 s8, s11 15179; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 15180; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 15181; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15182; GFX10-CU-NEXT: s_mov_b32 s5, s8 15183; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 15184; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15185; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15186; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15187; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15188; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15189; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15190; GFX10-CU-NEXT: s_endpgm 15191; 15192; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15193; SKIP-CACHE-INV: ; %bb.0: ; %entry 15194; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 15195; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 15196; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 15197; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 15198; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 15199; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15200; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 15201; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 15202; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 15203; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 15204; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 15205; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 15206; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 15207; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 15208; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 15209; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15210; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15211; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15212; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15213; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15214; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15215; SKIP-CACHE-INV-NEXT: s_endpgm 15216; 15217; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15218; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15219; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15220; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15221; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15222; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15223; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15224; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15225; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15226; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15227; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15228; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15229; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15230; 15231; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15232; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15233; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15234; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15235; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 
0xc 15236; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15237; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15238; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15239; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15240; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15241; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15242; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15243; GFX90A-TGSPLIT-NEXT: s_endpgm 15244; 15245; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15246; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15247; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15248; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15249; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15250; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15251; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15252; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15253; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15254; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15255; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15256; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15257; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15258; 15259; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15260; GFX940-TGSPLIT: ; %bb.0: ; %entry 15261; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15262; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15263; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15264; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15265; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15266; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15267; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15268; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15269; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15270; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
15271; GFX940-TGSPLIT-NEXT: s_endpgm 15272; 15273; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15274; GFX11-WGP: ; %bb.0: ; %entry 15275; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15276; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15277; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15278; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15279; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15280; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15281; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15282; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15283; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15284; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15285; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15286; GFX11-WGP-NEXT: s_endpgm 15287; 15288; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15289; GFX11-CU: ; %bb.0: ; %entry 15290; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15291; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15292; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15293; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15294; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15295; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15296; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15297; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15298; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15299; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15300; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15301; GFX11-CU-NEXT: s_endpgm 15302; 15303; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15304; GFX12-WGP: ; %bb.0: ; %entry 15305; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15306; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15307; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15308; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15309; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15310; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15311; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15312; GFX12-WGP-NEXT: v_mov_b32_e32 v3, 
v0 15313; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15314; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15315; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15316; GFX12-WGP-NEXT: s_endpgm 15317; 15318; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: 15319; GFX12-CU: ; %bb.0: ; %entry 15320; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15321; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15322; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15323; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15324; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15325; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15326; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15327; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15328; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15329; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15330; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15331; GFX12-CU-NEXT: s_endpgm 15332 ptr %out, i32 %in, i32 %old) { 15333entry: 15334 %gep = getelementptr i32, ptr %out, i32 4 15335 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst 15336 ret void 15337} 15338 15339define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( 15340; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15341; GFX7: ; %bb.0: ; %entry 15342; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 15343; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15344; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 15345; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 15346; GFX7-NEXT: s_mov_b64 s[10:11], 16 15347; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15348; GFX7-NEXT: s_mov_b32 s4, s8 15349; GFX7-NEXT: s_mov_b32 s5, s9 15350; GFX7-NEXT: s_mov_b32 s9, s10 15351; GFX7-NEXT: s_mov_b32 s8, s11 15352; GFX7-NEXT: s_add_u32 s4, s4, s9 15353; GFX7-NEXT: s_addc_u32 s8, s5, s8 15354; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15355; GFX7-NEXT: s_mov_b32 s5, s8 15356; GFX7-NEXT: v_mov_b32_e32 v2, s7 15357; GFX7-NEXT: v_mov_b32_e32 v0, s6 15358; 
GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15359; GFX7-NEXT: v_mov_b32_e32 v3, v0 15360; GFX7-NEXT: v_mov_b32_e32 v0, s4 15361; GFX7-NEXT: v_mov_b32_e32 v1, s5 15362; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15363; GFX7-NEXT: s_endpgm 15364; 15365; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15366; GFX10-WGP: ; %bb.0: ; %entry 15367; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] 15368; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15369; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 15370; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 15371; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 15372; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15373; GFX10-WGP-NEXT: s_mov_b32 s4, s8 15374; GFX10-WGP-NEXT: s_mov_b32 s5, s9 15375; GFX10-WGP-NEXT: s_mov_b32 s9, s10 15376; GFX10-WGP-NEXT: s_mov_b32 s8, s11 15377; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 15378; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 15379; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15380; GFX10-WGP-NEXT: s_mov_b32 s5, s8 15381; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 15382; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15383; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15384; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15385; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15386; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15387; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15388; GFX10-WGP-NEXT: s_endpgm 15389; 15390; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15391; GFX10-CU: ; %bb.0: ; %entry 15392; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] 15393; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 15394; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 15395; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc 15396; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 15397; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15398; GFX10-CU-NEXT: s_mov_b32 s4, s8 15399; GFX10-CU-NEXT: s_mov_b32 s5, s9 15400; GFX10-CU-NEXT: s_mov_b32 s9, s10 15401; GFX10-CU-NEXT: 
s_mov_b32 s8, s11 15402; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 15403; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 15404; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 15405; GFX10-CU-NEXT: s_mov_b32 s5, s8 15406; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 15407; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15408; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15409; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15410; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15411; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15412; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15413; GFX10-CU-NEXT: s_endpgm 15414; 15415; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15416; SKIP-CACHE-INV: ; %bb.0: ; %entry 15417; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 15418; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 15419; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 15420; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 15421; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 15422; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15423; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 15424; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 15425; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 15426; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 15427; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 15428; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 15429; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 15430; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 15431; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 15432; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15433; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15434; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15435; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15436; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15437; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 15438; SKIP-CACHE-INV-NEXT: s_endpgm 15439; 15440; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15441; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15442; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15443; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15444; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15445; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15446; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15447; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15448; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15449; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15450; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15451; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15452; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15453; 15454; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15455; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15456; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15457; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15458; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15459; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15460; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15461; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15462; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15463; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15464; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15465; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15466; GFX90A-TGSPLIT-NEXT: s_endpgm 15467; 15468; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15469; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15470; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15471; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15472; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15473; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15474; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15475; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15476; 
GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15477; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15478; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15479; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15480; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15481; 15482; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15483; GFX940-TGSPLIT: ; %bb.0: ; %entry 15484; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15485; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15486; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15487; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15488; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15489; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15490; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15491; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15492; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15493; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 15494; GFX940-TGSPLIT-NEXT: s_endpgm 15495; 15496; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15497; GFX11-WGP: ; %bb.0: ; %entry 15498; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15499; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15500; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15501; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15502; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15503; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15504; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15505; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15506; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15507; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15508; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15509; GFX11-WGP-NEXT: s_endpgm 15510; 15511; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15512; GFX11-CU: ; %bb.0: ; %entry 15513; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15514; GFX11-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 15515; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15516; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15517; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15518; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15519; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15520; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15521; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15522; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15523; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15524; GFX11-CU-NEXT: s_endpgm 15525; 15526; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15527; GFX12-WGP: ; %bb.0: ; %entry 15528; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15529; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15530; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15531; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15532; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15533; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 15534; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15535; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 15536; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15537; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15538; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15539; GFX12-WGP-NEXT: s_endpgm 15540; 15541; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: 15542; GFX12-CU: ; %bb.0: ; %entry 15543; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15544; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15545; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15546; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15547; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15548; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15549; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15550; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15551; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15552; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15553; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 15554; GFX12-CU-NEXT: s_endpgm 15555 ptr %out, i32 %in, i32 %old) { 15556entry: 15557 %gep = 
getelementptr i32, ptr %out, i32 4 15558 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst 15559 ret void 15560} 15561 15562define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( 15563; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15564; GFX7: ; %bb.0: ; %entry 15565; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 15566; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15567; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 15568; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 15569; GFX7-NEXT: s_mov_b64 s[12:13], 16 15570; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15571; GFX7-NEXT: s_mov_b32 s6, s4 15572; GFX7-NEXT: s_mov_b32 s7, s5 15573; GFX7-NEXT: s_mov_b32 s11, s12 15574; GFX7-NEXT: s_mov_b32 s10, s13 15575; GFX7-NEXT: s_add_u32 s6, s6, s11 15576; GFX7-NEXT: s_addc_u32 s10, s7, s10 15577; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15578; GFX7-NEXT: s_mov_b32 s7, s10 15579; GFX7-NEXT: v_mov_b32_e32 v2, s9 15580; GFX7-NEXT: v_mov_b32_e32 v0, s8 15581; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15582; GFX7-NEXT: v_mov_b32_e32 v3, v0 15583; GFX7-NEXT: v_mov_b32_e32 v0, s6 15584; GFX7-NEXT: v_mov_b32_e32 v1, s7 15585; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15586; GFX7-NEXT: v_mov_b32_e32 v0, s4 15587; GFX7-NEXT: v_mov_b32_e32 v1, s5 15588; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15589; GFX7-NEXT: flat_store_dword v[0:1], v2 15590; GFX7-NEXT: s_endpgm 15591; 15592; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15593; GFX10-WGP: ; %bb.0: ; %entry 15594; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 15595; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15596; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 15597; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 15598; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 15599; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15600; GFX10-WGP-NEXT: s_mov_b32 s6, s4 15601; GFX10-WGP-NEXT: 
s_mov_b32 s7, s5 15602; GFX10-WGP-NEXT: s_mov_b32 s11, s12 15603; GFX10-WGP-NEXT: s_mov_b32 s10, s13 15604; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 15605; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 15606; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15607; GFX10-WGP-NEXT: s_mov_b32 s7, s10 15608; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 15609; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 15610; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15611; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15612; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15613; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 15614; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15615; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15616; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15617; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15618; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 15619; GFX10-WGP-NEXT: s_endpgm 15620; 15621; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15622; GFX10-CU: ; %bb.0: ; %entry 15623; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 15624; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15625; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 15626; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 15627; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 15628; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15629; GFX10-CU-NEXT: s_mov_b32 s6, s4 15630; GFX10-CU-NEXT: s_mov_b32 s7, s5 15631; GFX10-CU-NEXT: s_mov_b32 s11, s12 15632; GFX10-CU-NEXT: s_mov_b32 s10, s13 15633; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 15634; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 15635; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15636; GFX10-CU-NEXT: s_mov_b32 s7, s10 15637; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 15638; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 15639; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15640; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15641; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15642; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 15643; 
GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15644; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15645; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15646; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15647; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 15648; GFX10-CU-NEXT: s_endpgm 15649; 15650; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15651; SKIP-CACHE-INV: ; %bb.0: ; %entry 15652; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 15653; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 15654; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 15655; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 15656; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 15657; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15658; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 15659; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 15660; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 15661; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 15662; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 15663; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 15664; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 15665; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 15666; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 15667; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 15668; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15669; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15670; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15671; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 15672; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15673; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15674; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15675; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15676; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 15677; SKIP-CACHE-INV-NEXT: s_endpgm 15678; 15679; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15680; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15681; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 15682; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15683; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15684; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15685; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15686; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15687; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15688; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15689; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15690; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 15691; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15692; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15693; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 15694; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15695; 15696; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15697; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15698; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15699; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15700; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15701; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15702; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15703; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15704; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15705; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15706; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15707; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 15708; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15709; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15710; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 15711; GFX90A-TGSPLIT-NEXT: s_endpgm 15712; 15713; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15714; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15715; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 15716; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15717; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15718; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15719; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15720; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15721; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15722; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15723; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15724; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 15725; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15726; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15727; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 15728; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15729; 15730; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15731; GFX940-TGSPLIT: ; %bb.0: ; %entry 15732; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15733; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15734; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15735; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15736; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15737; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15738; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15739; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15740; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15741; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 15742; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15743; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15744; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 15745; GFX940-TGSPLIT-NEXT: s_endpgm 15746; 15747; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15748; GFX11-WGP: ; %bb.0: ; %entry 15749; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15750; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15751; 
GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15752; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 15753; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 15754; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 15755; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15756; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 15757; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15758; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15759; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 15760; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 15761; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 15762; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15763; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 15764; GFX11-WGP-NEXT: s_endpgm 15765; 15766; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15767; GFX11-CU: ; %bb.0: ; %entry 15768; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15769; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15770; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15771; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 15772; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 15773; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 15774; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15775; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 15776; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15777; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15778; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 15779; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 15780; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 15781; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15782; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 15783; GFX11-CU-NEXT: s_endpgm 15784; 15785; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15786; GFX12-WGP: ; %bb.0: ; %entry 15787; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15788; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 15789; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 15790; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 15791; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 15792; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s2 15793; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15794; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 15795; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15796; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15797; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 15798; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 15799; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 15800; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 15801; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 15802; GFX12-WGP-NEXT: s_endpgm 15803; 15804; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: 15805; GFX12-CU: ; %bb.0: ; %entry 15806; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 15807; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 15808; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 15809; GFX12-CU-NEXT: s_wait_kmcnt 0x0 15810; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 15811; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 15812; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15813; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 15814; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15815; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15816; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 15817; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 15818; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 15819; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 15820; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 15821; GFX12-CU-NEXT: s_endpgm 15822 ptr %out, i32 %in, i32 %old) { 15823entry: 15824 %gep = getelementptr i32, ptr %out, i32 4 15825 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic 15826 %val0 = extractvalue { i32, i1 } %val, 0 15827 store i32 %val0, ptr %out, align 4 15828 ret void 15829} 15830 15831define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( 15832; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15833; GFX7: ; %bb.0: ; %entry 15834; 
GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 15835; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15836; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 15837; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 15838; GFX7-NEXT: s_mov_b64 s[12:13], 16 15839; GFX7-NEXT: s_waitcnt lgkmcnt(0) 15840; GFX7-NEXT: s_mov_b32 s6, s4 15841; GFX7-NEXT: s_mov_b32 s7, s5 15842; GFX7-NEXT: s_mov_b32 s11, s12 15843; GFX7-NEXT: s_mov_b32 s10, s13 15844; GFX7-NEXT: s_add_u32 s6, s6, s11 15845; GFX7-NEXT: s_addc_u32 s10, s7, s10 15846; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15847; GFX7-NEXT: s_mov_b32 s7, s10 15848; GFX7-NEXT: v_mov_b32_e32 v2, s9 15849; GFX7-NEXT: v_mov_b32_e32 v0, s8 15850; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15851; GFX7-NEXT: v_mov_b32_e32 v3, v0 15852; GFX7-NEXT: v_mov_b32_e32 v0, s6 15853; GFX7-NEXT: v_mov_b32_e32 v1, s7 15854; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15855; GFX7-NEXT: v_mov_b32_e32 v0, s4 15856; GFX7-NEXT: v_mov_b32_e32 v1, s5 15857; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15858; GFX7-NEXT: flat_store_dword v[0:1], v2 15859; GFX7-NEXT: s_endpgm 15860; 15861; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15862; GFX10-WGP: ; %bb.0: ; %entry 15863; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 15864; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15865; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 15866; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 15867; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 15868; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 15869; GFX10-WGP-NEXT: s_mov_b32 s6, s4 15870; GFX10-WGP-NEXT: s_mov_b32 s7, s5 15871; GFX10-WGP-NEXT: s_mov_b32 s11, s12 15872; GFX10-WGP-NEXT: s_mov_b32 s10, s13 15873; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 15874; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 15875; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15876; GFX10-WGP-NEXT: s_mov_b32 s7, s10 15877; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 15878; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s8 15879; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15880; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 15881; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 15882; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 15883; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15884; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 15885; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 15886; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15887; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 15888; GFX10-WGP-NEXT: s_endpgm 15889; 15890; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15891; GFX10-CU: ; %bb.0: ; %entry 15892; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 15893; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 15894; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 15895; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 15896; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 15897; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 15898; GFX10-CU-NEXT: s_mov_b32 s6, s4 15899; GFX10-CU-NEXT: s_mov_b32 s7, s5 15900; GFX10-CU-NEXT: s_mov_b32 s11, s12 15901; GFX10-CU-NEXT: s_mov_b32 s10, s13 15902; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 15903; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 15904; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 15905; GFX10-CU-NEXT: s_mov_b32 s7, s10 15906; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 15907; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 15908; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15909; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 15910; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 15911; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 15912; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15913; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 15914; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 15915; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15916; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 15917; GFX10-CU-NEXT: s_endpgm 15918; 15919; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15920; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 15921; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 15922; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 15923; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 15924; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 15925; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 15926; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 15927; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 15928; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 15929; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 15930; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 15931; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 15932; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 15933; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 15934; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 15935; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 15936; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 15937; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15938; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 15939; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 15940; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 15941; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 15943; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 15944; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15945; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 15946; SKIP-CACHE-INV-NEXT: s_endpgm 15947; 15948; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15949; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 15950; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15951; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15952; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15953; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15954; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15955; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15956; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
15957; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15958; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15959; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 15960; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15961; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15962; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 15963; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 15964; 15965; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15966; GFX90A-TGSPLIT: ; %bb.0: ; %entry 15967; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 15968; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 15969; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 15970; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15971; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 15972; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 15973; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 15974; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15975; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15976; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 15977; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 15978; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 15979; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 15980; GFX90A-TGSPLIT-NEXT: s_endpgm 15981; 15982; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 15983; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 15984; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 15985; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 15986; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 15987; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 15988; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 15989; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 15990; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec 15991; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 15992; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15993; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 15994; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 15995; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15996; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 15997; GFX940-NOTTGSPLIT-NEXT: s_endpgm 15998; 15999; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 16000; GFX940-TGSPLIT: ; %bb.0: ; %entry 16001; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16002; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16003; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16004; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16005; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16006; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16007; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16008; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16009; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16010; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16011; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16012; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16013; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16014; GFX940-TGSPLIT-NEXT: s_endpgm 16015; 16016; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 16017; GFX11-WGP: ; %bb.0: ; %entry 16018; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16019; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16020; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16021; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16022; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16023; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16024; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16025; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16026; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16027; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 
16028; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16029; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16030; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16031; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16032; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16033; GFX11-WGP-NEXT: s_endpgm 16034; 16035; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 16036; GFX11-CU: ; %bb.0: ; %entry 16037; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16038; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16039; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16040; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16041; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16042; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16043; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16044; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 16045; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16046; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16047; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16048; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16049; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16050; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16051; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16052; GFX11-CU-NEXT: s_endpgm 16053; 16054; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 16055; GFX12-WGP: ; %bb.0: ; %entry 16056; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16057; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16058; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16059; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16060; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16061; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16062; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16063; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16064; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16065; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16066; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16067; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16068; GFX12-WGP-NEXT: 
v_mov_b32_e32 v1, s1 16069; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 16070; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16071; GFX12-WGP-NEXT: s_endpgm 16072; 16073; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: 16074; GFX12-CU: ; %bb.0: ; %entry 16075; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16076; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16077; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16078; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16079; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16080; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16081; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16082; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 16083; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16084; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16085; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16086; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16087; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16088; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16089; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16090; GFX12-CU-NEXT: s_endpgm 16091 ptr %out, i32 %in, i32 %old) { 16092entry: 16093 %gep = getelementptr i32, ptr %out, i32 4 16094 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic 16095 %val0 = extractvalue { i32, i1 } %val, 0 16096 store i32 %val0, ptr %out, align 4 16097 ret void 16098} 16099 16100define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( 16101; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16102; GFX7: ; %bb.0: ; %entry 16103; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16104; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16105; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16106; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16107; GFX7-NEXT: s_mov_b64 s[12:13], 16 16108; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16109; GFX7-NEXT: s_mov_b32 s6, s4 16110; GFX7-NEXT: s_mov_b32 s7, s5 16111; GFX7-NEXT: s_mov_b32 s11, s12 16112; GFX7-NEXT: s_mov_b32 s10, s13 
16113; GFX7-NEXT: s_add_u32 s6, s6, s11 16114; GFX7-NEXT: s_addc_u32 s10, s7, s10 16115; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16116; GFX7-NEXT: s_mov_b32 s7, s10 16117; GFX7-NEXT: v_mov_b32_e32 v2, s9 16118; GFX7-NEXT: v_mov_b32_e32 v0, s8 16119; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16120; GFX7-NEXT: v_mov_b32_e32 v3, v0 16121; GFX7-NEXT: v_mov_b32_e32 v0, s6 16122; GFX7-NEXT: v_mov_b32_e32 v1, s7 16123; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16124; GFX7-NEXT: v_mov_b32_e32 v0, s4 16125; GFX7-NEXT: v_mov_b32_e32 v1, s5 16126; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16127; GFX7-NEXT: flat_store_dword v[0:1], v2 16128; GFX7-NEXT: s_endpgm 16129; 16130; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16131; GFX10-WGP: ; %bb.0: ; %entry 16132; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16133; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16134; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16135; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16136; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16137; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16138; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16139; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16140; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16141; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16142; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16143; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16144; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16145; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16146; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16147; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16148; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16149; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16150; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16151; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16152; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16153; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16154; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16155; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 16156; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16157; GFX10-WGP-NEXT: s_endpgm 16158; 16159; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16160; GFX10-CU: ; %bb.0: ; %entry 16161; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16162; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16163; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16164; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16165; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16166; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16167; GFX10-CU-NEXT: s_mov_b32 s6, s4 16168; GFX10-CU-NEXT: s_mov_b32 s7, s5 16169; GFX10-CU-NEXT: s_mov_b32 s11, s12 16170; GFX10-CU-NEXT: s_mov_b32 s10, s13 16171; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16172; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16173; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16174; GFX10-CU-NEXT: s_mov_b32 s7, s10 16175; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16176; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16177; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16178; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16179; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16180; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16181; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16182; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16183; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16184; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16185; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16186; GFX10-CU-NEXT: s_endpgm 16187; 16188; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16189; SKIP-CACHE-INV: ; %bb.0: ; %entry 16190; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16191; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16192; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16193; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16194; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16195; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16196; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16197; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16198; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16199; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16200; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 16201; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16202; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16203; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16204; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 16205; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16206; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16207; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16208; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 16209; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16210; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16211; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16212; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16213; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16214; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16215; SKIP-CACHE-INV-NEXT: s_endpgm 16216; 16217; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16218; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16219; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16220; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16221; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16222; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16223; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16224; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16225; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16226; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16227; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16228; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16229; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16230; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16231; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 
16232; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16233; 16234; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16235; GFX90A-TGSPLIT: ; %bb.0: ; %entry 16236; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16237; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16238; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16239; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16240; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16241; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16242; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16243; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16244; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16245; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16246; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16247; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16248; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16249; GFX90A-TGSPLIT-NEXT: s_endpgm 16250; 16251; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16252; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16253; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16254; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16255; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16256; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16257; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16258; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16259; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16260; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16261; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16262; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16263; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16264; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16265; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16266; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm 16267; 16268; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16269; GFX940-TGSPLIT: ; %bb.0: ; %entry 16270; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16271; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16272; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16273; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16274; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16275; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16276; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16277; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16278; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16279; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16280; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16281; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16282; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16283; GFX940-TGSPLIT-NEXT: s_endpgm 16284; 16285; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16286; GFX11-WGP: ; %bb.0: ; %entry 16287; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16288; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16289; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16290; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16291; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16292; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16293; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16294; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16295; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16296; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16297; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16298; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16299; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16300; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16301; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16302; GFX11-WGP-NEXT: s_endpgm 16303; 16304; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16305; 
GFX11-CU: ; %bb.0: ; %entry 16306; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16307; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16308; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16309; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16310; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16311; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16312; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16313; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 16314; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16315; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16316; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16317; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16318; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16319; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16320; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16321; GFX11-CU-NEXT: s_endpgm 16322; 16323; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16324; GFX12-WGP: ; %bb.0: ; %entry 16325; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16326; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16327; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16328; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16329; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16330; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16331; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16332; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16333; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16334; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16335; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16336; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16337; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16338; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 16339; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16340; GFX12-WGP-NEXT: s_endpgm 16341; 16342; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: 16343; GFX12-CU: ; %bb.0: ; %entry 16344; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16345; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16346; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc 16347; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16348; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16349; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16350; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16351; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 16352; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16353; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16354; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16355; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16356; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16357; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16358; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16359; GFX12-CU-NEXT: s_endpgm 16360 ptr %out, i32 %in, i32 %old) { 16361entry: 16362 %gep = getelementptr i32, ptr %out, i32 4 16363 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic 16364 %val0 = extractvalue { i32, i1 } %val, 0 16365 store i32 %val0, ptr %out, align 4 16366 ret void 16367} 16368 ; Value-returning volatile cmpxchg at syncscope("singlethread-one-as") with acq_rel success and monotonic failure ordering. It targets i32 element 4 of %out and stores field 0 of the cmpxchg result back to %out. The per-target checks inside the definition are autogenerated, so regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. 16369define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( 16370; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16371; GFX7: ; %bb.0: ; %entry 16372; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16373; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16374; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16375; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16376; GFX7-NEXT: s_mov_b64 s[12:13], 16 16377; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16378; GFX7-NEXT: s_mov_b32 s6, s4 16379; GFX7-NEXT: s_mov_b32 s7, s5 16380; GFX7-NEXT: s_mov_b32 s11, s12 16381; GFX7-NEXT: s_mov_b32 s10, s13 16382; GFX7-NEXT: s_add_u32 s6, s6, s11 16383; GFX7-NEXT: s_addc_u32 s10, s7, s10 16384; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16385; GFX7-NEXT: s_mov_b32 s7, s10 16386; GFX7-NEXT: v_mov_b32_e32 v2, s9 16387; GFX7-NEXT: v_mov_b32_e32 v0, s8 16388; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16389; GFX7-NEXT: v_mov_b32_e32 v3, v0 16390;
GFX7-NEXT: v_mov_b32_e32 v0, s6 16391; GFX7-NEXT: v_mov_b32_e32 v1, s7 16392; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16393; GFX7-NEXT: v_mov_b32_e32 v0, s4 16394; GFX7-NEXT: v_mov_b32_e32 v1, s5 16395; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16396; GFX7-NEXT: flat_store_dword v[0:1], v2 16397; GFX7-NEXT: s_endpgm 16398; 16399; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16400; GFX10-WGP: ; %bb.0: ; %entry 16401; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16402; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16403; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16404; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16405; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16406; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16407; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16408; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16409; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16410; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16411; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16412; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16413; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16414; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16415; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16416; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16417; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16418; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16419; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16420; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16421; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16422; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16423; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16424; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16425; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16426; GFX10-WGP-NEXT: s_endpgm 16427; 16428; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16429; GFX10-CU: ; %bb.0: ; %entry 16430; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16431; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16432; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 
16433; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16434; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16435; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16436; GFX10-CU-NEXT: s_mov_b32 s6, s4 16437; GFX10-CU-NEXT: s_mov_b32 s7, s5 16438; GFX10-CU-NEXT: s_mov_b32 s11, s12 16439; GFX10-CU-NEXT: s_mov_b32 s10, s13 16440; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16441; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16442; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16443; GFX10-CU-NEXT: s_mov_b32 s7, s10 16444; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16445; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16446; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16447; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16448; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16449; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16450; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16451; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16452; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16453; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16454; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16455; GFX10-CU-NEXT: s_endpgm 16456; 16457; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16458; SKIP-CACHE-INV: ; %bb.0: ; %entry 16459; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16460; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16461; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16462; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16463; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16464; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16465; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16466; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16467; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16468; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16469; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 16470; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16471; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16472; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16473; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 
16474; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16475; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16476; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16477; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 16478; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16479; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16480; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16481; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16482; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16483; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16484; SKIP-CACHE-INV-NEXT: s_endpgm 16485; 16486; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16487; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16488; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16489; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16490; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16491; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16492; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16493; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16494; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16495; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16496; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16497; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16498; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16499; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16500; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 16501; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16502; 16503; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16504; GFX90A-TGSPLIT: ; %bb.0: ; %entry 16505; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16506; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16507; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16508; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
16509; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16510; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16511; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16512; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16513; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16514; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16515; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16516; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16517; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16518; GFX90A-TGSPLIT-NEXT: s_endpgm 16519; 16520; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16521; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16522; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16523; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16524; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16525; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16526; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16527; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16528; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16529; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16530; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16531; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16532; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16533; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16534; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16535; GFX940-NOTTGSPLIT-NEXT: s_endpgm 16536; 16537; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16538; GFX940-TGSPLIT: ; %bb.0: ; %entry 16539; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16540; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16541; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16542; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16543; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s3 16544; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16545; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16546; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16547; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16548; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16549; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16550; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16551; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16552; GFX940-TGSPLIT-NEXT: s_endpgm 16553; 16554; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16555; GFX11-WGP: ; %bb.0: ; %entry 16556; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16557; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16558; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16559; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16560; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16561; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16562; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16563; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16564; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16565; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16566; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16567; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16568; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16569; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16570; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16571; GFX11-WGP-NEXT: s_endpgm 16572; 16573; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16574; GFX11-CU: ; %bb.0: ; %entry 16575; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16576; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16577; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16578; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16579; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16580; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16581; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16582; GFX11-CU-NEXT: 
v_mov_b32_e32 v3, v0 16583; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16584; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16585; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16586; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16587; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16588; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16589; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16590; GFX11-CU-NEXT: s_endpgm 16591; 16592; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16593; GFX12-WGP: ; %bb.0: ; %entry 16594; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16595; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16596; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16597; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16598; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16599; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16600; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16601; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16602; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16603; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16604; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16605; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16606; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16607; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 16608; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16609; GFX12-WGP-NEXT: s_endpgm 16610; 16611; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: 16612; GFX12-CU: ; %bb.0: ; %entry 16613; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16614; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16615; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16616; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16617; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16618; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16619; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16620; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 16621; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16622; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16623; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16624; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16625; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16626; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16627; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16628; GFX12-CU-NEXT: s_endpgm 16629 ptr %out, i32 %in, i32 %old) { 16630entry: 16631 %gep = getelementptr i32, ptr %out, i32 4 16632 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic 16633 %val0 = extractvalue { i32, i1 } %val, 0 16634 store i32 %val0, ptr %out, align 4 16635 ret void 16636} 16637 ; Value-returning volatile cmpxchg at syncscope("singlethread-one-as") with seq_cst success and monotonic failure ordering. It targets i32 element 4 of %out and stores field 0 of the cmpxchg result back to %out. The per-target checks inside the definition are autogenerated, so regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. 16638define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( 16639; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16640; GFX7: ; %bb.0: ; %entry 16641; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16642; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16643; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16644; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16645; GFX7-NEXT: s_mov_b64 s[12:13], 16 16646; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16647; GFX7-NEXT: s_mov_b32 s6, s4 16648; GFX7-NEXT: s_mov_b32 s7, s5 16649; GFX7-NEXT: s_mov_b32 s11, s12 16650; GFX7-NEXT: s_mov_b32 s10, s13 16651; GFX7-NEXT: s_add_u32 s6, s6, s11 16652; GFX7-NEXT: s_addc_u32 s10, s7, s10 16653; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16654; GFX7-NEXT: s_mov_b32 s7, s10 16655; GFX7-NEXT: v_mov_b32_e32 v2, s9 16656; GFX7-NEXT: v_mov_b32_e32 v0, s8 16657; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16658; GFX7-NEXT: v_mov_b32_e32 v3, v0 16659; GFX7-NEXT: v_mov_b32_e32 v0, s6 16660; GFX7-NEXT: v_mov_b32_e32 v1, s7 16661; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16662; GFX7-NEXT: v_mov_b32_e32 v0, s4 16663; GFX7-NEXT: v_mov_b32_e32 v1, s5 16664; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16665; GFX7-NEXT: flat_store_dword v[0:1], v2 16666; GFX7-NEXT: s_endpgm 16667; 16668; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
16669; GFX10-WGP: ; %bb.0: ; %entry 16670; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16671; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16672; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16673; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16674; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16675; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16676; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16677; GFX10-WGP-NEXT: s_mov_b32 s7, s5 16678; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16679; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16680; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16681; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16682; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16683; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16684; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16685; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16686; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16687; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16688; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16689; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16690; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16691; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16692; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16693; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16694; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16695; GFX10-WGP-NEXT: s_endpgm 16696; 16697; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16698; GFX10-CU: ; %bb.0: ; %entry 16699; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16700; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16701; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16702; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16703; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16704; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16705; GFX10-CU-NEXT: s_mov_b32 s6, s4 16706; GFX10-CU-NEXT: s_mov_b32 s7, s5 16707; GFX10-CU-NEXT: s_mov_b32 s11, s12 16708; GFX10-CU-NEXT: s_mov_b32 s10, s13 16709; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16710; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16711; GFX10-CU-NEXT: ; kill: 
def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16712; GFX10-CU-NEXT: s_mov_b32 s7, s10 16713; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16714; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16715; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16716; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16717; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16718; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16719; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16720; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16721; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16722; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16723; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16724; GFX10-CU-NEXT: s_endpgm 16725; 16726; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16727; SKIP-CACHE-INV: ; %bb.0: ; %entry 16728; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16729; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16730; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 16731; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 16732; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 16733; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 16734; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 16735; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 16736; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 16737; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 16738; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 16739; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 16740; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 16741; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 16742; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 16743; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 16744; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16745; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 16746; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 16747; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 16748; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16749; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 16750; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 16751; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16752; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 16753; SKIP-CACHE-INV-NEXT: s_endpgm 16754; 16755; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16756; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 16757; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16758; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16759; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16760; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16761; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16762; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16763; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16764; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16765; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16766; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16767; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16768; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16769; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 16770; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 16771; 16772; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16773; GFX90A-TGSPLIT: ; %bb.0: ; %entry 16774; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 16775; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 16776; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 16777; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16778; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 16779; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 16780; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16781; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16782; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16783; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 16784; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 16785; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16786; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 16787; GFX90A-TGSPLIT-NEXT: s_endpgm 16788; 16789; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16790; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 16791; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16792; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16793; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16794; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16795; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16796; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16797; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16798; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16799; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16800; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16801; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16802; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16803; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16804; GFX940-NOTTGSPLIT-NEXT: s_endpgm 16805; 16806; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16807; GFX940-TGSPLIT: ; %bb.0: ; %entry 16808; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16809; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 16810; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 16811; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 16812; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 16813; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 16814; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16815; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 16816; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 16817; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 16818; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[0:1] 16819; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 16820; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 16821; GFX940-TGSPLIT-NEXT: s_endpgm 16822; 16823; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16824; GFX11-WGP: ; %bb.0: ; %entry 16825; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16826; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16827; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16828; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 16829; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 16830; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 16831; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16832; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 16833; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16834; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16835; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16836; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 16837; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 16838; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16839; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 16840; GFX11-WGP-NEXT: s_endpgm 16841; 16842; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16843; GFX11-CU: ; %bb.0: ; %entry 16844; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16845; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16846; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16847; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 16848; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 16849; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 16850; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16851; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 16852; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16853; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16854; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 16855; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 16856; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 16857; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16858; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 16859; GFX11-CU-NEXT: s_endpgm 
16860; 16861; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16862; GFX12-WGP: ; %bb.0: ; %entry 16863; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16864; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 16865; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 16866; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 16867; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 16868; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 16869; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16870; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 16871; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16872; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16873; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16874; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 16875; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 16876; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 16877; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 16878; GFX12-WGP-NEXT: s_endpgm 16879; 16880; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: 16881; GFX12-CU: ; %bb.0: ; %entry 16882; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 16883; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 16884; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 16885; GFX12-CU-NEXT: s_wait_kmcnt 0x0 16886; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 16887; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 16888; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16889; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 16890; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16891; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16892; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 16893; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 16894; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 16895; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 16896; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 16897; GFX12-CU-NEXT: s_endpgm 16898 ptr %out, i32 %in, i32 %old) { 16899entry: 16900 %gep = getelementptr i32, ptr %out, i32 4 16901 %val = cmpxchg volatile ptr %gep, i32 
%old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic 16902 %val0 = extractvalue { i32, i1 } %val, 0 16903 store i32 %val0, ptr %out, align 4 16904 ret void 16905} 16906 16907define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( 16908; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 16909; GFX7: ; %bb.0: ; %entry 16910; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 16911; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16912; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 16913; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 16914; GFX7-NEXT: s_mov_b64 s[12:13], 16 16915; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16916; GFX7-NEXT: s_mov_b32 s6, s4 16917; GFX7-NEXT: s_mov_b32 s7, s5 16918; GFX7-NEXT: s_mov_b32 s11, s12 16919; GFX7-NEXT: s_mov_b32 s10, s13 16920; GFX7-NEXT: s_add_u32 s6, s6, s11 16921; GFX7-NEXT: s_addc_u32 s10, s7, s10 16922; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16923; GFX7-NEXT: s_mov_b32 s7, s10 16924; GFX7-NEXT: v_mov_b32_e32 v2, s9 16925; GFX7-NEXT: v_mov_b32_e32 v0, s8 16926; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16927; GFX7-NEXT: v_mov_b32_e32 v3, v0 16928; GFX7-NEXT: v_mov_b32_e32 v0, s6 16929; GFX7-NEXT: v_mov_b32_e32 v1, s7 16930; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16931; GFX7-NEXT: v_mov_b32_e32 v0, s4 16932; GFX7-NEXT: v_mov_b32_e32 v1, s5 16933; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16934; GFX7-NEXT: flat_store_dword v[0:1], v2 16935; GFX7-NEXT: s_endpgm 16936; 16937; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 16938; GFX10-WGP: ; %bb.0: ; %entry 16939; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 16940; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16941; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 16942; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 16943; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 16944; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 16945; GFX10-WGP-NEXT: s_mov_b32 s6, s4 16946; GFX10-WGP-NEXT: 
s_mov_b32 s7, s5 16947; GFX10-WGP-NEXT: s_mov_b32 s11, s12 16948; GFX10-WGP-NEXT: s_mov_b32 s10, s13 16949; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 16950; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 16951; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16952; GFX10-WGP-NEXT: s_mov_b32 s7, s10 16953; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 16954; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 16955; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16956; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 16957; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 16958; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 16959; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16960; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 16961; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 16962; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16963; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 16964; GFX10-WGP-NEXT: s_endpgm 16965; 16966; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 16967; GFX10-CU: ; %bb.0: ; %entry 16968; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 16969; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 16970; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 16971; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 16972; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 16973; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 16974; GFX10-CU-NEXT: s_mov_b32 s6, s4 16975; GFX10-CU-NEXT: s_mov_b32 s7, s5 16976; GFX10-CU-NEXT: s_mov_b32 s11, s12 16977; GFX10-CU-NEXT: s_mov_b32 s10, s13 16978; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 16979; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 16980; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 16981; GFX10-CU-NEXT: s_mov_b32 s7, s10 16982; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 16983; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 16984; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 16985; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 16986; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 16987; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 16988; 
GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16989; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 16990; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 16991; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16992; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 16993; GFX10-CU-NEXT: s_endpgm 16994; 16995; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 16996; SKIP-CACHE-INV: ; %bb.0: ; %entry 16997; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 16998; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 16999; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 17000; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 17001; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 17002; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 17003; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 17004; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 17005; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 17006; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 17007; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 17008; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 17009; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 17010; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 17011; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 17012; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 17013; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17014; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 17015; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 17016; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 17017; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17018; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 17019; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 17020; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17021; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 17022; SKIP-CACHE-INV-NEXT: s_endpgm 17023; 17024; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17025; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 17026; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 
0x0 17027; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17028; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17029; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17030; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17031; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17032; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17033; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17034; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17035; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17036; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17037; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17038; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 17039; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 17040; 17041; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17042; GFX90A-TGSPLIT: ; %bb.0: ; %entry 17043; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17044; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17045; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17046; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17047; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17048; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17049; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17050; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17051; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17052; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17053; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17054; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17055; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 17056; GFX90A-TGSPLIT-NEXT: s_endpgm 17057; 17058; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17059; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 17060; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 17061; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17062; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17063; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17064; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17065; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17066; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17067; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17068; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17069; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17070; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17071; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17072; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17073; GFX940-NOTTGSPLIT-NEXT: s_endpgm 17074; 17075; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17076; GFX940-TGSPLIT: ; %bb.0: ; %entry 17077; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17078; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17079; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17080; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17081; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17082; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17083; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17084; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17085; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17086; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17087; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17088; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17089; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17090; GFX940-TGSPLIT-NEXT: s_endpgm 17091; 17092; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17093; GFX11-WGP: ; %bb.0: ; %entry 17094; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17095; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17096; GFX11-WGP-NEXT: 
s_load_b32 s2, s[4:5], 0xc 17097; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17098; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17099; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17100; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17101; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17102; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17103; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17104; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17105; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17106; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17107; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17108; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17109; GFX11-WGP-NEXT: s_endpgm 17110; 17111; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17112; GFX11-CU: ; %bb.0: ; %entry 17113; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17114; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17115; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17116; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 17117; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17118; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17119; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17120; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17121; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17122; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17123; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17124; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17125; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17126; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17127; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17128; GFX11-CU-NEXT: s_endpgm 17129; 17130; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17131; GFX12-WGP: ; %bb.0: ; %entry 17132; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17133; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17134; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17135; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17136; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17137; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 
17138; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17139; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17140; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17141; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17142; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17143; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17144; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17145; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 17146; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17147; GFX12-WGP-NEXT: s_endpgm 17148; 17149; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: 17150; GFX12-CU: ; %bb.0: ; %entry 17151; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17152; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17153; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17154; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17155; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 17156; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 17157; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17158; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 17159; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17160; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17161; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17162; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17163; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17164; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 17165; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 17166; GFX12-CU-NEXT: s_endpgm 17167 ptr %out, i32 %in, i32 %old) { 17168entry: 17169 %gep = getelementptr i32, ptr %out, i32 4 17170 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire 17171 %val0 = extractvalue { i32, i1 } %val, 0 17172 store i32 %val0, ptr %out, align 4 17173 ret void 17174} 17175 17176define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( 17177; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17178; GFX7: ; %bb.0: ; %entry 17179; GFX7-NEXT: s_mov_b64 s[6:7], 
s[8:9] 17180; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17181; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 17182; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 17183; GFX7-NEXT: s_mov_b64 s[12:13], 16 17184; GFX7-NEXT: s_waitcnt lgkmcnt(0) 17185; GFX7-NEXT: s_mov_b32 s6, s4 17186; GFX7-NEXT: s_mov_b32 s7, s5 17187; GFX7-NEXT: s_mov_b32 s11, s12 17188; GFX7-NEXT: s_mov_b32 s10, s13 17189; GFX7-NEXT: s_add_u32 s6, s6, s11 17190; GFX7-NEXT: s_addc_u32 s10, s7, s10 17191; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17192; GFX7-NEXT: s_mov_b32 s7, s10 17193; GFX7-NEXT: v_mov_b32_e32 v2, s9 17194; GFX7-NEXT: v_mov_b32_e32 v0, s8 17195; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17196; GFX7-NEXT: v_mov_b32_e32 v3, v0 17197; GFX7-NEXT: v_mov_b32_e32 v0, s6 17198; GFX7-NEXT: v_mov_b32_e32 v1, s7 17199; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17200; GFX7-NEXT: v_mov_b32_e32 v0, s4 17201; GFX7-NEXT: v_mov_b32_e32 v1, s5 17202; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17203; GFX7-NEXT: flat_store_dword v[0:1], v2 17204; GFX7-NEXT: s_endpgm 17205; 17206; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17207; GFX10-WGP: ; %bb.0: ; %entry 17208; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 17209; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17210; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 17211; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 17212; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 17213; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17214; GFX10-WGP-NEXT: s_mov_b32 s6, s4 17215; GFX10-WGP-NEXT: s_mov_b32 s7, s5 17216; GFX10-WGP-NEXT: s_mov_b32 s11, s12 17217; GFX10-WGP-NEXT: s_mov_b32 s10, s13 17218; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 17219; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 17220; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17221; GFX10-WGP-NEXT: s_mov_b32 s7, s10 17222; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 17223; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 17224; 
GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17225; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 17226; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 17227; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 17228; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17229; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 17230; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 17231; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17232; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 17233; GFX10-WGP-NEXT: s_endpgm 17234; 17235; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17236; GFX10-CU: ; %bb.0: ; %entry 17237; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 17238; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17239; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 17240; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 17241; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 17242; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 17243; GFX10-CU-NEXT: s_mov_b32 s6, s4 17244; GFX10-CU-NEXT: s_mov_b32 s7, s5 17245; GFX10-CU-NEXT: s_mov_b32 s11, s12 17246; GFX10-CU-NEXT: s_mov_b32 s10, s13 17247; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 17248; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 17249; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17250; GFX10-CU-NEXT: s_mov_b32 s7, s10 17251; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 17252; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 17253; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17254; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 17255; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 17256; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 17257; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17258; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 17259; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 17260; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17261; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 17262; GFX10-CU-NEXT: s_endpgm 17263; 17264; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17265; SKIP-CACHE-INV: ; %bb.0: ; %entry 17266; 
SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 17267; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 17268; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 17269; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 17270; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 17271; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 17272; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 17273; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 17274; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 17275; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 17276; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 17277; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 17278; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 17279; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 17280; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 17281; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 17282; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17283; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 17284; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 17285; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 17286; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17287; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 17288; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 17289; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17290; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 17291; SKIP-CACHE-INV-NEXT: s_endpgm 17292; 17293; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17294; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 17295; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17296; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17297; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17298; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17299; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17300; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17301; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17302; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v3, v0 17303; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17304; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17305; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17306; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17307; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 17308; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 17309; 17310; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17311; GFX90A-TGSPLIT: ; %bb.0: ; %entry 17312; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17313; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17314; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17315; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17316; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17317; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17318; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17319; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17320; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17321; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17322; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17323; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17324; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 17325; GFX90A-TGSPLIT-NEXT: s_endpgm 17326; 17327; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17328; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 17329; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17330; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17331; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17332; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17333; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17334; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17335; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17336; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v3, v0 17337; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17338; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17339; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17340; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17341; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17342; GFX940-NOTTGSPLIT-NEXT: s_endpgm 17343; 17344; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17345; GFX940-TGSPLIT: ; %bb.0: ; %entry 17346; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17347; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17348; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17349; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17350; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17351; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17352; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17353; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17354; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17355; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17356; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17357; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17358; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17359; GFX940-TGSPLIT-NEXT: s_endpgm 17360; 17361; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17362; GFX11-WGP: ; %bb.0: ; %entry 17363; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17364; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17365; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17366; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17367; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17368; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17369; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17370; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17371; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17372; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17373; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:16 glc 17374; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17375; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17376; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17377; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17378; GFX11-WGP-NEXT: s_endpgm 17379; 17380; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17381; GFX11-CU: ; %bb.0: ; %entry 17382; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17383; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17384; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17385; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 17386; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17387; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17388; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17389; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17390; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17391; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17392; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17393; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17394; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17395; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17396; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17397; GFX11-CU-NEXT: s_endpgm 17398; 17399; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17400; GFX12-WGP: ; %bb.0: ; %entry 17401; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17402; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17403; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17404; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17405; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17406; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 17407; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17408; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17409; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17410; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17411; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17412; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17413; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17414; GFX12-WGP-NEXT: 
s_wait_loadcnt_dscnt 0x0 17415; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17416; GFX12-WGP-NEXT: s_endpgm 17417; 17418; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: 17419; GFX12-CU: ; %bb.0: ; %entry 17420; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17421; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17422; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17423; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17424; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 17425; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 17426; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17427; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 17428; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17429; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17430; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17431; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 17432; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 17433; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 17434; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 17435; GFX12-CU-NEXT: s_endpgm 17436 ptr %out, i32 %in, i32 %old) { 17437entry: 17438 %gep = getelementptr i32, ptr %out, i32 4 17439 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire 17440 %val0 = extractvalue { i32, i1 } %val, 0 17441 store i32 %val0, ptr %out, align 4 17442 ret void 17443} 17444 17445define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( 17446; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17447; GFX7: ; %bb.0: ; %entry 17448; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 17449; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17450; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 17451; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 17452; GFX7-NEXT: s_mov_b64 s[12:13], 16 17453; GFX7-NEXT: s_waitcnt lgkmcnt(0) 17454; GFX7-NEXT: s_mov_b32 s6, s4 17455; GFX7-NEXT: s_mov_b32 s7, s5 17456; GFX7-NEXT: s_mov_b32 s11, s12 17457; GFX7-NEXT: s_mov_b32 s10, s13 17458; GFX7-NEXT: s_add_u32 s6, s6, s11 17459; 
GFX7-NEXT: s_addc_u32 s10, s7, s10 17460; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17461; GFX7-NEXT: s_mov_b32 s7, s10 17462; GFX7-NEXT: v_mov_b32_e32 v2, s9 17463; GFX7-NEXT: v_mov_b32_e32 v0, s8 17464; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17465; GFX7-NEXT: v_mov_b32_e32 v3, v0 17466; GFX7-NEXT: v_mov_b32_e32 v0, s6 17467; GFX7-NEXT: v_mov_b32_e32 v1, s7 17468; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17469; GFX7-NEXT: v_mov_b32_e32 v0, s4 17470; GFX7-NEXT: v_mov_b32_e32 v1, s5 17471; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17472; GFX7-NEXT: flat_store_dword v[0:1], v2 17473; GFX7-NEXT: s_endpgm 17474; 17475; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17476; GFX10-WGP: ; %bb.0: ; %entry 17477; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 17478; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17479; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 17480; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 17481; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 17482; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 17483; GFX10-WGP-NEXT: s_mov_b32 s6, s4 17484; GFX10-WGP-NEXT: s_mov_b32 s7, s5 17485; GFX10-WGP-NEXT: s_mov_b32 s11, s12 17486; GFX10-WGP-NEXT: s_mov_b32 s10, s13 17487; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 17488; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 17489; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17490; GFX10-WGP-NEXT: s_mov_b32 s7, s10 17491; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 17492; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 17493; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17494; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 17495; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 17496; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 17497; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17498; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 17499; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 17500; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17501; 
GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 17502; GFX10-WGP-NEXT: s_endpgm 17503; 17504; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17505; GFX10-CU: ; %bb.0: ; %entry 17506; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 17507; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 17508; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 17509; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 17510; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 17511; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 17512; GFX10-CU-NEXT: s_mov_b32 s6, s4 17513; GFX10-CU-NEXT: s_mov_b32 s7, s5 17514; GFX10-CU-NEXT: s_mov_b32 s11, s12 17515; GFX10-CU-NEXT: s_mov_b32 s10, s13 17516; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 17517; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 17518; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 17519; GFX10-CU-NEXT: s_mov_b32 s7, s10 17520; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 17521; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 17522; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17523; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 17524; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 17525; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 17526; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17527; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 17528; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 17529; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17530; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 17531; GFX10-CU-NEXT: s_endpgm 17532; 17533; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17534; SKIP-CACHE-INV: ; %bb.0: ; %entry 17535; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 17536; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 17537; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 17538; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 17539; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 17540; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 17541; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 17542; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 17543; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 17544; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 17545; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 17546; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 17547; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 17548; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 17549; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 17550; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 17551; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17552; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 17553; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 17554; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 17555; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 17556; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 17557; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 17558; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17559; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 17560; SKIP-CACHE-INV-NEXT: s_endpgm 17561; 17562; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17563; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 17564; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17565; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17566; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17567; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17568; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17569; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17570; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17571; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17572; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17573; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17574; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17575; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17576; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 17577; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 17578; 
17579; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17580; GFX90A-TGSPLIT: ; %bb.0: ; %entry 17581; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 17582; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 17583; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 17584; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17585; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 17586; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 17587; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17588; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17589; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17590; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 17591; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 17592; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17593; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 17594; GFX90A-TGSPLIT-NEXT: s_endpgm 17595; 17596; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17597; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 17598; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17599; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17600; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17601; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17602; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17603; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17604; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17605; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17606; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17607; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17608; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17609; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17610; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17611; GFX940-NOTTGSPLIT-NEXT: s_endpgm 17612; 17613; GFX940-TGSPLIT-LABEL: 
flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17614; GFX940-TGSPLIT: ; %bb.0: ; %entry 17615; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 17616; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 17617; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 17618; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 17619; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 17620; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 17621; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17622; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 17623; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17624; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 17625; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 17626; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 17627; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 17628; GFX940-TGSPLIT-NEXT: s_endpgm 17629; 17630; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17631; GFX11-WGP: ; %bb.0: ; %entry 17632; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17633; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17634; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17635; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 17636; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 17637; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 17638; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17639; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 17640; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17641; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17642; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17643; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 17644; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 17645; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17646; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 17647; GFX11-WGP-NEXT: s_endpgm 17648; 17649; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17650; GFX11-CU: ; %bb.0: ; %entry 17651; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 17652; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17653; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17654; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 17655; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 17656; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 17657; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17658; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 17659; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17660; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17661; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 17662; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 17663; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 17664; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 17665; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 17666; GFX11-CU-NEXT: s_endpgm 17667; 17668; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17669; GFX12-WGP: ; %bb.0: ; %entry 17670; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17671; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 17672; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 17673; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 17674; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 17675; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 17676; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 17677; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 17678; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17679; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17680; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 17681; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 17682; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 17683; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 17684; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 17685; GFX12-WGP-NEXT: s_endpgm 17686; 17687; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: 17688; GFX12-CU: ; %bb.0: ; %entry 17689; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 17690; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 17691; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 17692; GFX12-CU-NEXT: s_wait_kmcnt 0x0 17693; 
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, ptr %out, i32 4
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, ptr %out, align 4
  ret void
}

; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as",
; success ordering acq_rel, failure ordering acquire.
; The i32 read by the cmpxchg is stored to %out, so each set of checks below
; expects the atomic's result register (v2) to feed the final flat store.
; Every check prefix lists only the cmpswap, a wait between the cmpswap and the
; dependent store, and the store; no other memory-ordering instructions appear.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Address of the exchanged word: %out advanced by 4 i32 elements (16 bytes).
  ; The checks show this either as an explicit 64-bit add of 16 or as offset:16.
  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, write %in on match; acq_rel on success, acquire on failure.
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
  ; Element 0 of the { i32, i1 } result is the i32 the cmpxchg read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic alive in the output.
  store i32 %val0, ptr %out, align 4
  ret void
}

; Returning cmpxchg through a flat pointer at syncscope "singlethread-one-as",
; success ordering seq_cst, failure ordering acquire; the value read by the
; cmpxchg is stored to %out at the end of the kernel.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
;
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Address of the exchanged word: %out advanced by 4 i32 elements (16 bytes).
  ; The checks show this either as an explicit 64-bit add of 16 or as offset:16.
  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, write %in on match; seq_cst on success, acquire on failure,
  ; at syncscope "singlethread-one-as".
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
  ; Element 0 of the { i32, i1 } result is the i32 the cmpxchg read from memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic alive in the output.
  store i32 %val0, ptr %out, align 4
  ret void
}

define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT:
s_add_u32 s6, s6, s11 18295; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 18296; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18297; GFX10-WGP-NEXT: s_mov_b32 s7, s10 18298; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 18299; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 18300; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18301; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 18302; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 18303; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 18304; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18305; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 18306; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 18307; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18308; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 18309; GFX10-WGP-NEXT: s_endpgm 18310; 18311; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18312; GFX10-CU: ; %bb.0: ; %entry 18313; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 18314; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18315; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 18316; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 18317; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 18318; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 18319; GFX10-CU-NEXT: s_mov_b32 s6, s4 18320; GFX10-CU-NEXT: s_mov_b32 s7, s5 18321; GFX10-CU-NEXT: s_mov_b32 s11, s12 18322; GFX10-CU-NEXT: s_mov_b32 s10, s13 18323; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 18324; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 18325; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18326; GFX10-CU-NEXT: s_mov_b32 s7, s10 18327; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 18328; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 18329; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18330; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 18331; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 18332; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 18333; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18334; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 18335; GFX10-CU-NEXT: v_mov_b32_e32 
v1, s5 18336; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18337; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 18338; GFX10-CU-NEXT: s_endpgm 18339; 18340; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18341; SKIP-CACHE-INV: ; %bb.0: ; %entry 18342; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 18343; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 18344; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 18345; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 18346; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 18347; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 18348; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 18349; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 18350; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 18351; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 18352; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 18353; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 18354; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 18355; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 18356; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 18357; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 18358; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18359; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 18360; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 18361; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 18362; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18363; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 18364; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 18365; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18366; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 18367; SKIP-CACHE-INV-NEXT: s_endpgm 18368; 18369; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18370; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 18371; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18372; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18373; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18374; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18375; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18376; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18377; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18378; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18379; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18380; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18381; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18382; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18383; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 18384; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 18385; 18386; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18387; GFX90A-TGSPLIT: ; %bb.0: ; %entry 18388; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18389; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18390; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18391; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18392; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18393; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18394; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18395; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18396; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18397; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18398; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18399; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18400; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 18401; GFX90A-TGSPLIT-NEXT: s_endpgm 18402; 18403; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18404; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 18405; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18406; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18407; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 
18408; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18409; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18410; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18411; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18412; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18413; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18414; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18415; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18416; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18417; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18418; GFX940-NOTTGSPLIT-NEXT: s_endpgm 18419; 18420; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18421; GFX940-TGSPLIT: ; %bb.0: ; %entry 18422; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18423; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18424; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18425; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18426; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18427; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18428; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18429; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18430; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18431; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18432; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18433; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18434; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18435; GFX940-TGSPLIT-NEXT: s_endpgm 18436; 18437; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18438; GFX11-WGP: ; %bb.0: ; %entry 18439; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18440; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18441; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18442; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18443; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 18444; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 18445; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18446; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 18447; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18448; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18449; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18450; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18451; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18452; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18453; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 18454; GFX11-WGP-NEXT: s_endpgm 18455; 18456; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18457; GFX11-CU: ; %bb.0: ; %entry 18458; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18459; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18460; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18461; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 18462; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 18463; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 18464; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18465; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 18466; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18467; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18468; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18469; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18470; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18471; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18472; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 18473; GFX11-CU-NEXT: s_endpgm 18474; 18475; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18476; GFX12-WGP: ; %bb.0: ; %entry 18477; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18478; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18479; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18480; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 18481; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 18482; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 18483; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18484; GFX12-WGP-NEXT: v_mov_b32_e32 v3, 
v0 18485; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18486; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18487; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 18488; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18489; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18490; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 18491; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 18492; GFX12-WGP-NEXT: s_endpgm 18493; 18494; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: 18495; GFX12-CU: ; %bb.0: ; %entry 18496; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18497; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18498; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18499; GFX12-CU-NEXT: s_wait_kmcnt 0x0 18500; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 18501; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 18502; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18503; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 18504; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18505; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18506; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 18507; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18508; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18509; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 18510; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 18511; GFX12-CU-NEXT: s_endpgm 18512 ptr %out, i32 %in, i32 %old) { 18513entry: 18514 %gep = getelementptr i32, ptr %out, i32 4 18515 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst 18516 %val0 = extractvalue { i32, i1 } %val, 0 18517 store i32 %val0, ptr %out, align 4 18518 ret void 18519} 18520 18521define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( 18522; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18523; GFX7: ; %bb.0: ; %entry 18524; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 18525; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18526; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 18527; GFX7-NEXT: 
s_load_dword s8, s[6:7], 0x3 18528; GFX7-NEXT: s_mov_b64 s[12:13], 16 18529; GFX7-NEXT: s_waitcnt lgkmcnt(0) 18530; GFX7-NEXT: s_mov_b32 s6, s4 18531; GFX7-NEXT: s_mov_b32 s7, s5 18532; GFX7-NEXT: s_mov_b32 s11, s12 18533; GFX7-NEXT: s_mov_b32 s10, s13 18534; GFX7-NEXT: s_add_u32 s6, s6, s11 18535; GFX7-NEXT: s_addc_u32 s10, s7, s10 18536; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18537; GFX7-NEXT: s_mov_b32 s7, s10 18538; GFX7-NEXT: v_mov_b32_e32 v2, s9 18539; GFX7-NEXT: v_mov_b32_e32 v0, s8 18540; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18541; GFX7-NEXT: v_mov_b32_e32 v3, v0 18542; GFX7-NEXT: v_mov_b32_e32 v0, s6 18543; GFX7-NEXT: v_mov_b32_e32 v1, s7 18544; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18545; GFX7-NEXT: v_mov_b32_e32 v0, s4 18546; GFX7-NEXT: v_mov_b32_e32 v1, s5 18547; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18548; GFX7-NEXT: flat_store_dword v[0:1], v2 18549; GFX7-NEXT: s_endpgm 18550; 18551; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18552; GFX10-WGP: ; %bb.0: ; %entry 18553; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 18554; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18555; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 18556; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 18557; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 18558; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 18559; GFX10-WGP-NEXT: s_mov_b32 s6, s4 18560; GFX10-WGP-NEXT: s_mov_b32 s7, s5 18561; GFX10-WGP-NEXT: s_mov_b32 s11, s12 18562; GFX10-WGP-NEXT: s_mov_b32 s10, s13 18563; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 18564; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 18565; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18566; GFX10-WGP-NEXT: s_mov_b32 s7, s10 18567; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 18568; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 18569; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18570; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 18571; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 18572; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 18573; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18574; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 18575; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 18576; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18577; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 18578; GFX10-WGP-NEXT: s_endpgm 18579; 18580; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18581; GFX10-CU: ; %bb.0: ; %entry 18582; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 18583; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18584; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 18585; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 18586; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 18587; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 18588; GFX10-CU-NEXT: s_mov_b32 s6, s4 18589; GFX10-CU-NEXT: s_mov_b32 s7, s5 18590; GFX10-CU-NEXT: s_mov_b32 s11, s12 18591; GFX10-CU-NEXT: s_mov_b32 s10, s13 18592; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 18593; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 18594; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18595; GFX10-CU-NEXT: s_mov_b32 s7, s10 18596; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 18597; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 18598; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18599; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 18600; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 18601; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 18602; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18603; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 18604; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 18605; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18606; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 18607; GFX10-CU-NEXT: s_endpgm 18608; 18609; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18610; SKIP-CACHE-INV: ; %bb.0: ; %entry 18611; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 18612; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 18613; 
SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 18614; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 18615; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 18616; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 18617; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 18618; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 18619; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 18620; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 18621; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 18622; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 18623; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 18624; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 18625; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 18626; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 18627; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18628; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 18629; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 18630; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 18631; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18632; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 18633; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 18634; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18635; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 18636; SKIP-CACHE-INV-NEXT: s_endpgm 18637; 18638; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18639; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 18640; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18641; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18642; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18643; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18644; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18645; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18646; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18647; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18648; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18649; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18650; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18651; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18652; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 18653; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 18654; 18655; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18656; GFX90A-TGSPLIT: ; %bb.0: ; %entry 18657; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18658; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18659; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18660; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18661; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18662; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18663; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18664; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18665; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18666; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18667; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18668; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18669; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 18670; GFX90A-TGSPLIT-NEXT: s_endpgm 18671; 18672; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18673; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 18674; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18675; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18676; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18677; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18678; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18679; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18680; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18681; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18682; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18683; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] offset:16 sc0 18684; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18685; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18686; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18687; GFX940-NOTTGSPLIT-NEXT: s_endpgm 18688; 18689; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18690; GFX940-TGSPLIT: ; %bb.0: ; %entry 18691; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18692; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18693; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18694; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18695; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18696; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18697; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18698; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18699; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18700; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18701; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18702; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18703; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18704; GFX940-TGSPLIT-NEXT: s_endpgm 18705; 18706; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18707; GFX11-WGP: ; %bb.0: ; %entry 18708; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18709; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18710; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18711; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18712; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 18713; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 18714; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18715; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 18716; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18717; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18718; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18719; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18720; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18721; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18722; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 18723; GFX11-WGP-NEXT: s_endpgm 18724; 18725; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18726; GFX11-CU: ; %bb.0: ; %entry 18727; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18728; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18729; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18730; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 18731; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 18732; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 18733; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18734; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 18735; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18736; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18737; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18738; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 18739; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 18740; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18741; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 18742; GFX11-CU-NEXT: s_endpgm 18743; 18744; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18745; GFX12-WGP: ; %bb.0: ; %entry 18746; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18747; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18748; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18749; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 18750; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 18751; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 18752; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18753; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 18754; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18755; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18756; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 18757; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 18758; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 18759; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 18760; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 18761; GFX12-WGP-NEXT: s_endpgm 18762; 18763; GFX12-CU-LABEL: 
flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: 18764; GFX12-CU: ; %bb.0: ; %entry 18765; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18766; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18767; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 18768; GFX12-CU-NEXT: s_wait_kmcnt 0x0 18769; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 18770; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 18771; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18772; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 18773; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18774; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18775; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 18776; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 18777; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 18778; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 18779; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 18780; GFX12-CU-NEXT: s_endpgm 18781 ptr %out, i32 %in, i32 %old) { 18782entry: 18783 %gep = getelementptr i32, ptr %out, i32 4 18784 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst 18785 %val0 = extractvalue { i32, i1 } %val, 0 18786 store i32 %val0, ptr %out, align 4 18787 ret void 18788} 18789 18790define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( 18791; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18792; GFX7: ; %bb.0: ; %entry 18793; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] 18794; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18795; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 18796; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 18797; GFX7-NEXT: s_mov_b64 s[12:13], 16 18798; GFX7-NEXT: s_waitcnt lgkmcnt(0) 18799; GFX7-NEXT: s_mov_b32 s6, s4 18800; GFX7-NEXT: s_mov_b32 s7, s5 18801; GFX7-NEXT: s_mov_b32 s11, s12 18802; GFX7-NEXT: s_mov_b32 s10, s13 18803; GFX7-NEXT: s_add_u32 s6, s6, s11 18804; GFX7-NEXT: s_addc_u32 s10, s7, s10 18805; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18806; GFX7-NEXT: s_mov_b32 s7, s10 
18807; GFX7-NEXT: v_mov_b32_e32 v2, s9 18808; GFX7-NEXT: v_mov_b32_e32 v0, s8 18809; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18810; GFX7-NEXT: v_mov_b32_e32 v3, v0 18811; GFX7-NEXT: v_mov_b32_e32 v0, s6 18812; GFX7-NEXT: v_mov_b32_e32 v1, s7 18813; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18814; GFX7-NEXT: v_mov_b32_e32 v0, s4 18815; GFX7-NEXT: v_mov_b32_e32 v1, s5 18816; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18817; GFX7-NEXT: flat_store_dword v[0:1], v2 18818; GFX7-NEXT: s_endpgm 18819; 18820; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18821; GFX10-WGP: ; %bb.0: ; %entry 18822; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] 18823; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18824; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 18825; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 18826; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 18827; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 18828; GFX10-WGP-NEXT: s_mov_b32 s6, s4 18829; GFX10-WGP-NEXT: s_mov_b32 s7, s5 18830; GFX10-WGP-NEXT: s_mov_b32 s11, s12 18831; GFX10-WGP-NEXT: s_mov_b32 s10, s13 18832; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 18833; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 18834; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18835; GFX10-WGP-NEXT: s_mov_b32 s7, s10 18836; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 18837; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 18838; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18839; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 18840; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 18841; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 18842; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18843; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 18844; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 18845; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18846; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 18847; GFX10-WGP-NEXT: s_endpgm 18848; 18849; GFX10-CU-LABEL: 
flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18850; GFX10-CU: ; %bb.0: ; %entry 18851; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] 18852; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 18853; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 18854; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc 18855; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 18856; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 18857; GFX10-CU-NEXT: s_mov_b32 s6, s4 18858; GFX10-CU-NEXT: s_mov_b32 s7, s5 18859; GFX10-CU-NEXT: s_mov_b32 s11, s12 18860; GFX10-CU-NEXT: s_mov_b32 s10, s13 18861; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 18862; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 18863; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 18864; GFX10-CU-NEXT: s_mov_b32 s7, s10 18865; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 18866; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 18867; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18868; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 18869; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 18870; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 18871; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18872; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 18873; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 18874; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18875; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 18876; GFX10-CU-NEXT: s_endpgm 18877; 18878; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18879; SKIP-CACHE-INV: ; %bb.0: ; %entry 18880; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] 18881; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 18882; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 18883; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 18884; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 18885; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 18886; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 18887; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 18888; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 18889; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 18890; SKIP-CACHE-INV-NEXT: 
s_add_u32 s2, s2, s7 18891; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 18892; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 18893; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 18894; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 18895; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 18896; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18897; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 18898; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 18899; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 18900; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 18901; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 18902; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 18903; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18904; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 18905; SKIP-CACHE-INV-NEXT: s_endpgm 18906; 18907; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18908; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 18909; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18910; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18911; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18912; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18913; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18914; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18915; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18916; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18917; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18918; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18919; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18920; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18921; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 18922; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 18923; 18924; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18925; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry 18926; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 18927; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 18928; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 18929; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18930; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 18931; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 18932; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18933; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18934; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18935; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc 18936; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] 18937; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18938; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 18939; GFX90A-TGSPLIT-NEXT: s_endpgm 18940; 18941; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18942; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 18943; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18944; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18945; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18946; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18947; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18948; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18949; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18950; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18951; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18952; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18953; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18954; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18955; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18956; GFX940-NOTTGSPLIT-NEXT: s_endpgm 18957; 18958; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18959; GFX940-TGSPLIT: ; %bb.0: ; %entry 18960; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 18961; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 18962; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 18963; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 18964; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 18965; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 18966; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18967; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 18968; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18969; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 18970; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 18971; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 18972; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 18973; GFX940-TGSPLIT-NEXT: s_endpgm 18974; 18975; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18976; GFX11-WGP: ; %bb.0: ; %entry 18977; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18978; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 18979; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 18980; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 18981; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 18982; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 18983; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 18984; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 18985; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18986; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18987; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 18988; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 18989; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 18990; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18991; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 18992; GFX11-WGP-NEXT: s_endpgm 18993; 18994; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 18995; GFX11-CU: ; %bb.0: ; %entry 18996; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 18997; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 18998; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
18999; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 19000; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 19001; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 19002; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19003; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 19004; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19005; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19006; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 19007; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 19008; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 19009; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 19010; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 19011; GFX11-CU-NEXT: s_endpgm 19012; 19013; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 19014; GFX12-WGP: ; %bb.0: ; %entry 19015; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19016; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 19017; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc 19018; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 19019; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 19020; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 19021; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19022; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 19023; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19024; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19025; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 19026; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 19027; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 19028; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 19029; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 19030; GFX12-WGP-NEXT: s_endpgm 19031; 19032; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: 19033; GFX12-CU: ; %bb.0: ; %entry 19034; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 19035; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 19036; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 19037; GFX12-CU-NEXT: s_wait_kmcnt 0x0 19038; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 19039; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 19040; GFX12-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 19041; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 19042; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19043; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19044; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 19045; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 19046; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 19047; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 19048; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 19049; GFX12-CU-NEXT: s_endpgm 19050 ptr %out, i32 %in, i32 %old) { 19051entry: 19052 %gep = getelementptr i32, ptr %out, i32 4 19053 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst 19054 %val0 = extractvalue { i32, i1 } %val, 0 19055 store i32 %val0, ptr %out, align 4 19056 ret void 19057} 19058

; Returning 32-bit cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), success ordering acq_rel, failure ordering
; seq_cst. The exchange targets %out advanced by four i32 elements (the 16-byte
; offset visible in the expected code) and the value it read is stored back
; through %out itself.
; For every run configuration the expected sequence consists of the
; compare-swap, a single wait and the store; no cache invalidate or write-back
; instruction appears in any of them.
; NOTE(review): the assertion lines are produced by update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT:    s_mov_b64 s[12:13], 16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, s4
; GFX7-NEXT:    s_mov_b32 s7, s5
; GFX7-NEXT:    s_mov_b32 s11, s12
; GFX7-NEXT:    s_mov_b32 s10, s13
; GFX7-NEXT:    s_add_u32 s6, s6, s11
; GFX7-NEXT:    s_addc_u32 s10, s7, s10
; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT:    s_mov_b32 s7, s10
; GFX7-NEXT:    v_mov_b32_e32 v2, s9
; GFX7-NEXT:    v_mov_b32_e32 v0, s8
; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT:    v_mov_b32_e32 v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_mov_b32 s6, s4
; GFX10-CU-NEXT:    s_mov_b32 s7, s5
; GFX10-CU-NEXT:    s_mov_b32 s11, s12
; GFX10-CU-NEXT:    s_mov_b32 s10, s13
; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT:    s_mov_b32 s7, s10
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT:    s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Address of the i32 element four slots past %out (byte offset 16).
  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and, on a match, write %in.
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
  ; Field 0 of the { i32, i1 } result pair is the value that was in memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Keep the result live by writing it through the base pointer.
  store i32 %val0, ptr %out, align 4
  ret void
}

; Returning 32-bit cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), with seq_cst used for both the success and
; the failure ordering. The exchange targets %out advanced by four i32 elements
; (the 16-byte offset visible in the expected code) and the value it read is
; stored back through %out itself.
; For every run configuration the expected sequence consists of the
; compare-swap, a single wait and the store; no cache invalidate or write-back
; instruction appears in any of them.
; NOTE(review): the assertion lines are produced by update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT:    s_mov_b64 s[12:13], 16
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, s4
; GFX7-NEXT:    s_mov_b32 s7, s5
; GFX7-NEXT:    s_mov_b32 s11, s12
; GFX7-NEXT:    s_mov_b32 s10, s13
; GFX7-NEXT:    s_add_u32 s6, s6, s11
; GFX7-NEXT:    s_addc_u32 s10, s7, s10
; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT:    s_mov_b32 s7, s10
; GFX7-NEXT:    v_mov_b32_e32 v2, s9
; GFX7-NEXT:    v_mov_b32_e32 v0, s8
; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT:    v_mov_b32_e32 v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_mov_b32 s6, s4
; GFX10-CU-NEXT:    s_mov_b32 s7, s5
; GFX10-CU-NEXT:    s_mov_b32 s11, s12
; GFX10-CU-NEXT:    s_mov_b32 s10, s13
; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT:    s_mov_b32 s7, s10
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT:    s_endpgm
    ptr %out, i32 %in, i32 %old) {
entry:
  ; Address of the i32 element four slots past %out (byte offset 16).
  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and, on a match, write %in.
  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result pair is the value that was in memory.
  %val0 = extractvalue { i32, i1 } %val, 0
  ; Keep the result live by writing it through the base pointer.
  store i32 %val0, ptr %out, align 4
  ret void
}