xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
14
; Test kernel: performs an atomic i32 load from %in with syncscope("workgroup")
; and unordered ordering, then writes the loaded value to %out with a plain
; (non-atomic) store.
;
; The assertion groups interleaved with the signature are autogenerated (see
; the NOTE at the top of the file) -- one group per FileCheck prefix used by
; the RUN lines. Regenerate them with the script instead of editing by hand.
; In every group of this function the expected load carries no cache-policy
; modifier.
15define amdgpu_kernel void @flat_workgroup_unordered_load(
16; GFX7-LABEL: flat_workgroup_unordered_load:
17; GFX7:       ; %bb.0: ; %entry
18; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
19; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    v_mov_b32_e32 v0, s6
22; GFX7-NEXT:    v_mov_b32_e32 v1, s7
23; GFX7-NEXT:    flat_load_dword v2, v[0:1]
24; GFX7-NEXT:    v_mov_b32_e32 v0, s4
25; GFX7-NEXT:    v_mov_b32_e32 v1, s5
26; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
27; GFX7-NEXT:    flat_store_dword v[0:1], v2
28; GFX7-NEXT:    s_endpgm
29;
30; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
31; GFX10-WGP:       ; %bb.0: ; %entry
32; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
33; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
34; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
36; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
37; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
38; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
39; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
40; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
41; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
42; GFX10-WGP-NEXT:    s_endpgm
43;
44; GFX10-CU-LABEL: flat_workgroup_unordered_load:
45; GFX10-CU:       ; %bb.0: ; %entry
46; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
47; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
48; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
50; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
51; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
52; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
53; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
54; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
55; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
56; GFX10-CU-NEXT:    s_endpgm
57;
58; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load:
59; SKIP-CACHE-INV:       ; %bb.0: ; %entry
60; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
61; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
62; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
63; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
64; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
65; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
66; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
67; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
68; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
69; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
70; SKIP-CACHE-INV-NEXT:    s_endpgm
71;
72; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
73; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
74; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
75; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
76; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
78; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
79; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
80; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
81; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
82; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
83;
84; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
85; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
86; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
87; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
88; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
90; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
91; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
92; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
93; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
94; GFX90A-TGSPLIT-NEXT:    s_endpgm
95;
96; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
97; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
98; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
99; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
100; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
102; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
103; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
104; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
105; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
106; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
107;
108; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load:
109; GFX940-TGSPLIT:       ; %bb.0: ; %entry
110; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
111; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
112; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
114; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
115; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
116; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
117; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
118; GFX940-TGSPLIT-NEXT:    s_endpgm
119;
120; GFX11-WGP-LABEL: flat_workgroup_unordered_load:
121; GFX11-WGP:       ; %bb.0: ; %entry
122; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
123; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
124; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
126; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
127; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
128; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
129; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
130; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
131; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
132; GFX11-WGP-NEXT:    s_endpgm
133;
134; GFX11-CU-LABEL: flat_workgroup_unordered_load:
135; GFX11-CU:       ; %bb.0: ; %entry
136; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
137; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
138; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
140; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
141; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
142; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
143; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
144; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
145; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
146; GFX11-CU-NEXT:    s_endpgm
147;
148; GFX12-WGP-LABEL: flat_workgroup_unordered_load:
149; GFX12-WGP:       ; %bb.0: ; %entry
150; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
151; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
152; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
153; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
154; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
155; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
156; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
157; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
158; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
159; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
160; GFX12-WGP-NEXT:    s_endpgm
161;
162; GFX12-CU-LABEL: flat_workgroup_unordered_load:
163; GFX12-CU:       ; %bb.0: ; %entry
164; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
165; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
166; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
167; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
168; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
169; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
170; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
171; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
172; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
173; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
174; GFX12-CU-NEXT:    s_endpgm
175    ptr %in, ptr %out) {
176entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live so the load is not dropped.
177  %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4
178  store i32 %val, ptr %out
179  ret void
180}
181
; Test kernel: performs an atomic i32 load from %in with syncscope("workgroup")
; and monotonic ordering, then writes the loaded value to %out with a plain
; (non-atomic) store.
;
; The assertion groups interleaved with the signature are autogenerated (see
; the NOTE at the top of the file) -- one group per FileCheck prefix used by
; the RUN lines. Regenerate them with the script instead of editing by hand.
; In some groups the expected load carries a cache-policy modifier (glc, sc0,
; or scope:SCOPE_SE); in the others it is a plain load.
182define amdgpu_kernel void @flat_workgroup_monotonic_load(
183; GFX7-LABEL: flat_workgroup_monotonic_load:
184; GFX7:       ; %bb.0: ; %entry
185; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
186; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
187; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX7-NEXT:    v_mov_b32_e32 v0, s6
189; GFX7-NEXT:    v_mov_b32_e32 v1, s7
190; GFX7-NEXT:    flat_load_dword v2, v[0:1]
191; GFX7-NEXT:    v_mov_b32_e32 v0, s4
192; GFX7-NEXT:    v_mov_b32_e32 v1, s5
193; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
194; GFX7-NEXT:    flat_store_dword v[0:1], v2
195; GFX7-NEXT:    s_endpgm
196;
197; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
198; GFX10-WGP:       ; %bb.0: ; %entry
199; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
200; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
201; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
202; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
203; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
204; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
205; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
206; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
207; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
208; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
209; GFX10-WGP-NEXT:    s_endpgm
210;
211; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
212; GFX10-CU:       ; %bb.0: ; %entry
213; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
214; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
215; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
217; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
218; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
219; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
220; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
221; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
222; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
223; GFX10-CU-NEXT:    s_endpgm
224;
225; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load:
226; SKIP-CACHE-INV:       ; %bb.0: ; %entry
227; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
228; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
229; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
231; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
232; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
235; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
236; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
237; SKIP-CACHE-INV-NEXT:    s_endpgm
238;
239; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
240; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
241; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
242; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
243; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
245; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
246; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
247; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
248; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
249; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
250;
251; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
252; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
253; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
254; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
255; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
257; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
259; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
260; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
261; GFX90A-TGSPLIT-NEXT:    s_endpgm
262;
263; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
264; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
265; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
266; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
267; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
269; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
270; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
271; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
272; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
273; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
274;
275; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
276; GFX940-TGSPLIT:       ; %bb.0: ; %entry
277; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
278; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
279; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
280; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
281; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
282; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
283; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
284; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
285; GFX940-TGSPLIT-NEXT:    s_endpgm
286;
287; GFX11-WGP-LABEL: flat_workgroup_monotonic_load:
288; GFX11-WGP:       ; %bb.0: ; %entry
289; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
290; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
291; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
293; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
294; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
295; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
296; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
297; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
299; GFX11-WGP-NEXT:    s_endpgm
300;
301; GFX11-CU-LABEL: flat_workgroup_monotonic_load:
302; GFX11-CU:       ; %bb.0: ; %entry
303; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
304; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
305; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
307; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
308; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
309; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
310; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
311; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
312; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
313; GFX11-CU-NEXT:    s_endpgm
314;
315; GFX12-WGP-LABEL: flat_workgroup_monotonic_load:
316; GFX12-WGP:       ; %bb.0: ; %entry
317; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
318; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
319; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
320; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
321; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
322; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SE
323; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
324; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
325; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
326; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
327; GFX12-WGP-NEXT:    s_endpgm
328;
329; GFX12-CU-LABEL: flat_workgroup_monotonic_load:
330; GFX12-CU:       ; %bb.0: ; %entry
331; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
332; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
333; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
334; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
335; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
336; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
337; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
338; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
339; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
340; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
341; GFX12-CU-NEXT:    s_endpgm
342    ptr %in, ptr %out) {
343entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live so the load is not dropped.
344  %val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4
345  store i32 %val, ptr %out
346  ret void
347}
348
; Test kernel: performs an atomic i32 load from %in with syncscope("workgroup")
; and acquire ordering, then writes the loaded value to %out with a plain
; (non-atomic) store.
;
; The assertion groups interleaved with the signature are autogenerated (see
; the NOTE at the top of the file) -- one group per FileCheck prefix used by
; the RUN lines. Regenerate them with the script instead of editing by hand.
; Compared with a plain load, every group expects a wait between the load and
; the store, and some groups additionally expect a cache invalidate after the
; load (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0, or global_inv).
349define amdgpu_kernel void @flat_workgroup_acquire_load(
350; GFX7-LABEL: flat_workgroup_acquire_load:
351; GFX7:       ; %bb.0: ; %entry
352; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
353; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7-NEXT:    v_mov_b32_e32 v0, s6
356; GFX7-NEXT:    v_mov_b32_e32 v1, s7
357; GFX7-NEXT:    flat_load_dword v2, v[0:1]
358; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX7-NEXT:    v_mov_b32_e32 v0, s4
360; GFX7-NEXT:    v_mov_b32_e32 v1, s5
361; GFX7-NEXT:    s_waitcnt vmcnt(0)
362; GFX7-NEXT:    flat_store_dword v[0:1], v2
363; GFX7-NEXT:    s_endpgm
364;
365; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
366; GFX10-WGP:       ; %bb.0: ; %entry
367; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
368; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
369; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
371; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
372; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
373; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
374; GFX10-WGP-NEXT:    buffer_gl0_inv
375; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
376; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
377; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
378; GFX10-WGP-NEXT:    s_endpgm
379;
380; GFX10-CU-LABEL: flat_workgroup_acquire_load:
381; GFX10-CU:       ; %bb.0: ; %entry
382; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
383; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
384; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
386; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
387; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
388; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
390; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
391; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
392; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
393; GFX10-CU-NEXT:    s_endpgm
394;
395; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load:
396; SKIP-CACHE-INV:       ; %bb.0: ; %entry
397; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
398; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
399; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
400; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
401; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
402; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
403; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
404; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
405; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
406; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
407; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
408; SKIP-CACHE-INV-NEXT:    s_endpgm
409;
410; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
411; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
412; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
413; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
414; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
416; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
417; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
419; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
420; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
421; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
422;
423; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
424; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
425; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
426; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
427; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
429; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
430; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
431; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
432; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
433; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
434; GFX90A-TGSPLIT-NEXT:    s_endpgm
435;
436; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
437; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
438; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
439; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
440; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
442; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
443; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
445; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
446; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
447; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
448;
449; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load:
450; GFX940-TGSPLIT:       ; %bb.0: ; %entry
451; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
452; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
453; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
455; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
456; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
457; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
458; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
459; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
460; GFX940-TGSPLIT-NEXT:    s_endpgm
461;
462; GFX11-WGP-LABEL: flat_workgroup_acquire_load:
463; GFX11-WGP:       ; %bb.0: ; %entry
464; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
465; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
466; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
468; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
469; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
470; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
471; GFX11-WGP-NEXT:    buffer_gl0_inv
472; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
473; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
474; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
475; GFX11-WGP-NEXT:    s_endpgm
476;
477; GFX11-CU-LABEL: flat_workgroup_acquire_load:
478; GFX11-CU:       ; %bb.0: ; %entry
479; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
480; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
481; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
482; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
483; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
484; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
485; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
487; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
488; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
489; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
490; GFX11-CU-NEXT:    s_endpgm
491;
492; GFX12-WGP-LABEL: flat_workgroup_acquire_load:
493; GFX12-WGP:       ; %bb.0: ; %entry
494; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
495; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
496; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
497; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
498; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
499; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SE
500; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
501; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
502; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
503; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
504; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
505; GFX12-WGP-NEXT:    s_endpgm
506;
507; GFX12-CU-LABEL: flat_workgroup_acquire_load:
508; GFX12-CU:       ; %bb.0: ; %entry
509; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
510; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
511; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
512; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
513; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
514; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
515; GFX12-CU-NEXT:    s_wait_dscnt 0x0
516; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
517; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
518; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
519; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
520; GFX12-CU-NEXT:    s_endpgm
521    ptr %in, ptr %out) {
522entry:
  ; The atomic load is the operation under test; the store below only keeps
  ; the loaded value live so the load is not dropped.
523  %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4
524  store i32 %val, ptr %out
525  ret void
526}
527
528define amdgpu_kernel void @flat_workgroup_seq_cst_load(
529; GFX7-LABEL: flat_workgroup_seq_cst_load:
530; GFX7:       ; %bb.0: ; %entry
531; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
532; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
533; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX7-NEXT:    v_mov_b32_e32 v0, s6
535; GFX7-NEXT:    v_mov_b32_e32 v1, s7
536; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX7-NEXT:    flat_load_dword v2, v[0:1]
538; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX7-NEXT:    v_mov_b32_e32 v0, s4
540; GFX7-NEXT:    v_mov_b32_e32 v1, s5
541; GFX7-NEXT:    s_waitcnt vmcnt(0)
542; GFX7-NEXT:    flat_store_dword v[0:1], v2
543; GFX7-NEXT:    s_endpgm
544;
545; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
546; GFX10-WGP:       ; %bb.0: ; %entry
547; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
548; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
549; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
551; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
552; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
553; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
554; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
555; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
556; GFX10-WGP-NEXT:    buffer_gl0_inv
557; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
558; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
559; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
560; GFX10-WGP-NEXT:    s_endpgm
561;
562; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
563; GFX10-CU:       ; %bb.0: ; %entry
564; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
565; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
566; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
568; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
569; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
570; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
571; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
573; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
574; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
575; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
576; GFX10-CU-NEXT:    s_endpgm
577;
578; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load:
579; SKIP-CACHE-INV:       ; %bb.0: ; %entry
580; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
581; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
582; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
583; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
584; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
585; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
586; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
587; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
588; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
590; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
591; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
592; SKIP-CACHE-INV-NEXT:    s_endpgm
593;
594; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
595; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
596; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
597; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
598; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
600; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
602; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
604; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
605; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
606; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
607;
608; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
609; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
610; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
611; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
612; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
614; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
615; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
616; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
617; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
618; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
619; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
620; GFX90A-TGSPLIT-NEXT:    s_endpgm
621;
622; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
623; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
624; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
625; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
626; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
628; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
630; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
632; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
633; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
634; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
635;
636; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
637; GFX940-TGSPLIT:       ; %bb.0: ; %entry
638; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
639; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
640; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
641; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
642; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
643; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
644; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
645; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
646; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
647; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
648; GFX940-TGSPLIT-NEXT:    s_endpgm
649;
650; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load:
651; GFX11-WGP:       ; %bb.0: ; %entry
652; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
653; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
654; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
656; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
657; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
658; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
659; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
660; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
661; GFX11-WGP-NEXT:    buffer_gl0_inv
662; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
663; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
664; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
665; GFX11-WGP-NEXT:    s_endpgm
666;
667; GFX11-CU-LABEL: flat_workgroup_seq_cst_load:
668; GFX11-CU:       ; %bb.0: ; %entry
669; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
670; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
671; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
673; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
674; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
676; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
678; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
679; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
680; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
681; GFX11-CU-NEXT:    s_endpgm
682;
683; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load:
684; GFX12-WGP:       ; %bb.0: ; %entry
685; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
686; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
687; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
688; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
689; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
690; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
691; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
692; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
693; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
694; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SE
695; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
696; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
697; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
698; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
699; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
700; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
701; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
702; GFX12-WGP-NEXT:    s_endpgm
703;
704; GFX12-CU-LABEL: flat_workgroup_seq_cst_load:
705; GFX12-CU:       ; %bb.0: ; %entry
706; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
707; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
708; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
709; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
710; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
711; GFX12-CU-NEXT:    s_wait_dscnt 0x0
712; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
713; GFX12-CU-NEXT:    s_wait_dscnt 0x0
714; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
715; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
716; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
717; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
718; GFX12-CU-NEXT:    s_endpgm
719    ptr %in, ptr %out) {
720entry:
721  %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
722  store i32 %val, ptr %out
723  ret void
724}
725
; Test: atomic store of %in to %out at syncscope("workgroup") with unordered
; ordering, through a plain `ptr` (the expected instructions are flat stores).
; The autogenerated assertions below expect, for every RUN configuration, only
; the kernel-argument loads, one wait on those loads, the register moves and a
; single flat store; no further wait is expected in front of the store.
; Expected modifiers on the store: "sc0 sc1" for the gfx940 runs, none for any
; other run.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
726define amdgpu_kernel void @flat_workgroup_unordered_store(
727; GFX7-LABEL: flat_workgroup_unordered_store:
728; GFX7:       ; %bb.0: ; %entry
729; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
730; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
731; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX7-NEXT:    v_mov_b32_e32 v0, s6
733; GFX7-NEXT:    v_mov_b32_e32 v1, s7
734; GFX7-NEXT:    v_mov_b32_e32 v2, s4
735; GFX7-NEXT:    flat_store_dword v[0:1], v2
736; GFX7-NEXT:    s_endpgm
737;
738; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
739; GFX10-WGP:       ; %bb.0: ; %entry
740; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
741; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
742; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
743; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
744; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
745; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
746; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
747; GFX10-WGP-NEXT:    s_endpgm
748;
749; GFX10-CU-LABEL: flat_workgroup_unordered_store:
750; GFX10-CU:       ; %bb.0: ; %entry
751; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
752; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
753; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
754; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
755; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
756; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
757; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
758; GFX10-CU-NEXT:    s_endpgm
759;
760; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store:
761; SKIP-CACHE-INV:       ; %bb.0: ; %entry
762; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
763; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
764; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
765; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
767; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
768; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
769; SKIP-CACHE-INV-NEXT:    s_endpgm
770;
771; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
772; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
773; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
774; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
775; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
777; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
778; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
779; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
780;
781; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
782; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
783; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
784; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
785; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
787; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
788; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
789; GFX90A-TGSPLIT-NEXT:    s_endpgm
790;
791; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
792; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
793; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
794; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
795; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
797; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
798; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
799; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
800;
801; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store:
802; GFX940-TGSPLIT:       ; %bb.0: ; %entry
803; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
804; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
805; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
807; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
808; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
809; GFX940-TGSPLIT-NEXT:    s_endpgm
810;
811; GFX11-WGP-LABEL: flat_workgroup_unordered_store:
812; GFX11-WGP:       ; %bb.0: ; %entry
813; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
814; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
815; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
817; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
818; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
819; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
820; GFX11-WGP-NEXT:    s_endpgm
821;
822; GFX11-CU-LABEL: flat_workgroup_unordered_store:
823; GFX11-CU:       ; %bb.0: ; %entry
824; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
825; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
826; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
828; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
829; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
830; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
831; GFX11-CU-NEXT:    s_endpgm
832;
833; GFX12-WGP-LABEL: flat_workgroup_unordered_store:
834; GFX12-WGP:       ; %bb.0: ; %entry
835; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
836; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
837; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
838; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
839; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
840; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
841; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
842; GFX12-WGP-NEXT:    s_endpgm
843;
844; GFX12-CU-LABEL: flat_workgroup_unordered_store:
845; GFX12-CU:       ; %bb.0: ; %entry
846; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
847; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
848; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
849; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
850; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
851; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
852; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
853; GFX12-CU-NEXT:    s_endpgm
854    i32 %in, ptr %out) {
855entry:
856  store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
857  ret void
858}
859
; Test: atomic store of %in to %out at syncscope("workgroup") with monotonic
; ordering, through a plain `ptr` (the expected instructions are flat stores).
; The autogenerated assertions below expect no wait between the register moves
; and the flat store in any RUN configuration.
; Expected modifiers on the store: "sc0 sc1" for the gfx940 runs,
; "scope:SCOPE_SE" for the gfx1200 run without +cumode, and none elsewhere.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
860define amdgpu_kernel void @flat_workgroup_monotonic_store(
861; GFX7-LABEL: flat_workgroup_monotonic_store:
862; GFX7:       ; %bb.0: ; %entry
863; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
864; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
865; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
866; GFX7-NEXT:    v_mov_b32_e32 v0, s6
867; GFX7-NEXT:    v_mov_b32_e32 v1, s7
868; GFX7-NEXT:    v_mov_b32_e32 v2, s4
869; GFX7-NEXT:    flat_store_dword v[0:1], v2
870; GFX7-NEXT:    s_endpgm
871;
872; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
873; GFX10-WGP:       ; %bb.0: ; %entry
874; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
875; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
876; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
878; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
879; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
880; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
881; GFX10-WGP-NEXT:    s_endpgm
882;
883; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
884; GFX10-CU:       ; %bb.0: ; %entry
885; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
886; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
887; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
888; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
889; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
890; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
891; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
892; GFX10-CU-NEXT:    s_endpgm
893;
894; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store:
895; SKIP-CACHE-INV:       ; %bb.0: ; %entry
896; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
897; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
898; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
899; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
900; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
901; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
902; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
903; SKIP-CACHE-INV-NEXT:    s_endpgm
904;
905; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
906; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
907; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
908; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
909; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
911; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
912; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
913; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
914;
915; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
916; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
917; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
918; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
919; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
920; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
921; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
922; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
923; GFX90A-TGSPLIT-NEXT:    s_endpgm
924;
925; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
926; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
927; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
928; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
929; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
930; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
931; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
932; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
933; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
934;
935; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
936; GFX940-TGSPLIT:       ; %bb.0: ; %entry
937; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
938; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
939; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
941; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
942; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
943; GFX940-TGSPLIT-NEXT:    s_endpgm
944;
945; GFX11-WGP-LABEL: flat_workgroup_monotonic_store:
946; GFX11-WGP:       ; %bb.0: ; %entry
947; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
948; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
949; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
950; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
951; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
952; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
953; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
954; GFX11-WGP-NEXT:    s_endpgm
955;
956; GFX11-CU-LABEL: flat_workgroup_monotonic_store:
957; GFX11-CU:       ; %bb.0: ; %entry
958; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
959; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
960; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
961; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
962; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
963; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
964; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
965; GFX11-CU-NEXT:    s_endpgm
966;
967; GFX12-WGP-LABEL: flat_workgroup_monotonic_store:
968; GFX12-WGP:       ; %bb.0: ; %entry
969; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
970; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
971; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
972; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
973; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
974; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
975; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
976; GFX12-WGP-NEXT:    s_endpgm
977;
978; GFX12-CU-LABEL: flat_workgroup_monotonic_store:
979; GFX12-CU:       ; %bb.0: ; %entry
980; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
981; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
982; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
983; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
984; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
985; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
986; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
987; GFX12-CU-NEXT:    s_endpgm
988    i32 %in, ptr %out) {
989entry:
990  store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
991  ret void
992}
993
; Test: atomic store of %in to %out at syncscope("workgroup") with release
; ordering, through a plain `ptr` (the expected instructions are flat stores).
; Every RUN configuration is expected to place a wait directly in front of the
; flat store, after the register moves:
;   - s_waitcnt lgkmcnt(0) for gfx700 (both triples), for gfx1010 and gfx1100
;     with +cumode, and for gfx90a and gfx940 without +tgsplit;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) for gfx90a and gfx940 with +tgsplit;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) followed by s_waitcnt_vscnt null, 0x0 for
;     gfx1010 and gfx1100 without +cumode;
;   - s_wait_bvhcnt, s_wait_samplecnt, s_wait_storecnt and
;     s_wait_loadcnt_dscnt (each 0x0) for gfx1200 without +cumode;
;   - s_wait_dscnt 0x0 for gfx1200 with +cumode.
; Expected modifiers on the store: "sc0 sc1" for the gfx940 runs,
; "scope:SCOPE_SE" for the gfx1200 run without +cumode, and none elsewhere.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
994define amdgpu_kernel void @flat_workgroup_release_store(
995; GFX7-LABEL: flat_workgroup_release_store:
996; GFX7:       ; %bb.0: ; %entry
997; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
998; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
999; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1001; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1002; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1003; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX7-NEXT:    flat_store_dword v[0:1], v2
1005; GFX7-NEXT:    s_endpgm
1006;
1007; GFX10-WGP-LABEL: flat_workgroup_release_store:
1008; GFX10-WGP:       ; %bb.0: ; %entry
1009; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
1010; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1011; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1012; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1013; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1014; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1015; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1016; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1017; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1018; GFX10-WGP-NEXT:    s_endpgm
1019;
1020; GFX10-CU-LABEL: flat_workgroup_release_store:
1021; GFX10-CU:       ; %bb.0: ; %entry
1022; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
1023; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1024; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1026; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1027; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1028; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1030; GFX10-CU-NEXT:    s_endpgm
1031;
1032; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store:
1033; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1034; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
1035; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1036; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1037; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1038; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1039; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1040; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1041; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1042; SKIP-CACHE-INV-NEXT:    s_endpgm
1043;
1044; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
1045; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1046; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1047; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1048; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1049; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1050; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1051; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1053; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1054;
1055; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
1056; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1057; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1058; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1059; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1061; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1062; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1063; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1064; GFX90A-TGSPLIT-NEXT:    s_endpgm
1065;
1066; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
1067; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1068; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1069; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1070; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1072; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1073; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1074; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1075; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1076;
1077; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store:
1078; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1079; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1080; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1081; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1083; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1084; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1085; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1086; GFX940-TGSPLIT-NEXT:    s_endpgm
1087;
1088; GFX11-WGP-LABEL: flat_workgroup_release_store:
1089; GFX11-WGP:       ; %bb.0: ; %entry
1090; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1091; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1092; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1094; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1095; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1096; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1097; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1098; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1099; GFX11-WGP-NEXT:    s_endpgm
1100;
1101; GFX11-CU-LABEL: flat_workgroup_release_store:
1102; GFX11-CU:       ; %bb.0: ; %entry
1103; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1104; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1105; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1107; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1108; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1109; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1111; GFX11-CU-NEXT:    s_endpgm
1112;
1113; GFX12-WGP-LABEL: flat_workgroup_release_store:
1114; GFX12-WGP:       ; %bb.0: ; %entry
1115; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1116; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1117; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1118; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1119; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1120; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1121; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1122; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1123; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1124; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1125; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
1126; GFX12-WGP-NEXT:    s_endpgm
1127;
1128; GFX12-CU-LABEL: flat_workgroup_release_store:
1129; GFX12-CU:       ; %bb.0: ; %entry
1130; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1131; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1132; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1133; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1134; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1135; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1136; GFX12-CU-NEXT:    s_wait_dscnt 0x0
1137; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
1138; GFX12-CU-NEXT:    s_endpgm
1139    i32 %in, ptr %out) {
1140entry:
1141  store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
1142  ret void
1143}
1144
; Test: atomic store of %in to %out at syncscope("workgroup") with seq_cst
; ordering, through a plain `ptr` (the expected instructions are flat stores).
; Every RUN configuration is expected to place a wait directly in front of the
; flat store, after the register moves:
;   - s_waitcnt lgkmcnt(0) for gfx700 (both triples), for gfx1010 and gfx1100
;     with +cumode, and for gfx90a and gfx940 without +tgsplit;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) for gfx90a and gfx940 with +tgsplit;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) followed by s_waitcnt_vscnt null, 0x0 for
;     gfx1010 and gfx1100 without +cumode;
;   - s_wait_bvhcnt, s_wait_samplecnt, s_wait_storecnt and
;     s_wait_loadcnt_dscnt (each 0x0) for gfx1200 without +cumode;
;   - s_wait_dscnt 0x0 for gfx1200 with +cumode.
; Expected modifiers on the store: "sc0 sc1" for the gfx940 runs,
; "scope:SCOPE_SE" for the gfx1200 run without +cumode, and none elsewhere.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
1145define amdgpu_kernel void @flat_workgroup_seq_cst_store(
1146; GFX7-LABEL: flat_workgroup_seq_cst_store:
1147; GFX7:       ; %bb.0: ; %entry
1148; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
1149; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
1150; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1152; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1153; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1154; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1155; GFX7-NEXT:    flat_store_dword v[0:1], v2
1156; GFX7-NEXT:    s_endpgm
1157;
1158; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
1159; GFX10-WGP:       ; %bb.0: ; %entry
1160; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
1161; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1162; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1164; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1165; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1166; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1167; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1168; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1169; GFX10-WGP-NEXT:    s_endpgm
1170;
1171; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
1172; GFX10-CU:       ; %bb.0: ; %entry
1173; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
1174; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1175; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1176; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1177; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1178; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1179; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1181; GFX10-CU-NEXT:    s_endpgm
1182;
1183; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store:
1184; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1185; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
1186; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1187; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1189; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1190; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1191; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1192; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1193; SKIP-CACHE-INV-NEXT:    s_endpgm
1194;
1195; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1196; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1197; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1198; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1199; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1201; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1204; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1205;
1206; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1207; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1208; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1209; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1210; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1211; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1212; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1213; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1214; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1215; GFX90A-TGSPLIT-NEXT:    s_endpgm
1216;
1217; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1218; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1219; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1220; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1221; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1223; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1224; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1225; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1226; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1227;
1228; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1229; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1230; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1231; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1232; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1233; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1234; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1235; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1236; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1237; GFX940-TGSPLIT-NEXT:    s_endpgm
1238;
1239; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store:
1240; GFX11-WGP:       ; %bb.0: ; %entry
1241; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1242; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1243; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1244; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1245; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1246; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1247; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1248; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1249; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1250; GFX11-WGP-NEXT:    s_endpgm
1251;
1252; GFX11-CU-LABEL: flat_workgroup_seq_cst_store:
1253; GFX11-CU:       ; %bb.0: ; %entry
1254; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1255; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1256; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1257; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1258; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1259; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1260; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1262; GFX11-CU-NEXT:    s_endpgm
1263;
1264; GFX12-WGP-LABEL: flat_workgroup_seq_cst_store:
1265; GFX12-WGP:       ; %bb.0: ; %entry
1266; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1267; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1268; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1269; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1270; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1271; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1272; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1273; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1274; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1275; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1276; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
1277; GFX12-WGP-NEXT:    s_endpgm
1278;
1279; GFX12-CU-LABEL: flat_workgroup_seq_cst_store:
1280; GFX12-CU:       ; %bb.0: ; %entry
1281; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1282; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1283; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1284; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1285; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1286; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1287; GFX12-CU-NEXT:    s_wait_dscnt 0x0
1288; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
1289; GFX12-CU-NEXT:    s_endpgm
1290    i32 %in, ptr %out) {
1291entry:
1292  store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
1293  ret void
1294}
1295
1296define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
1297; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
1298; GFX7:       ; %bb.0: ; %entry
1299; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1300; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1301; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1302; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1303; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1304; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1305; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1306; GFX7-NEXT:    s_endpgm
1307;
1308; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
1309; GFX10-WGP:       ; %bb.0: ; %entry
1310; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1311; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1312; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1313; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1314; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1315; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1316; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1317; GFX10-WGP-NEXT:    s_endpgm
1318;
1319; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
1320; GFX10-CU:       ; %bb.0: ; %entry
1321; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1322; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1323; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1324; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1325; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1326; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1327; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1328; GFX10-CU-NEXT:    s_endpgm
1329;
1330; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw:
1331; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1332; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1333; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1334; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1335; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1336; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1337; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1338; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1339; SKIP-CACHE-INV-NEXT:    s_endpgm
1340;
1341; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1342; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1343; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1344; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1345; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1346; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1347; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1348; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1349; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1350;
1351; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1352; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1353; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1354; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1355; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1356; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1358; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1359; GFX90A-TGSPLIT-NEXT:    s_endpgm
1360;
1361; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1362; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1363; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1364; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1365; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1366; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1367; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1368; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1369; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1370;
1371; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1372; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1373; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1374; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1375; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1376; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1377; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1378; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1379; GFX940-TGSPLIT-NEXT:    s_endpgm
1380;
1381; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
1382; GFX11-WGP:       ; %bb.0: ; %entry
1383; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1384; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1385; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1386; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1387; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1388; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1389; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1390; GFX11-WGP-NEXT:    s_endpgm
1391;
1392; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
1393; GFX11-CU:       ; %bb.0: ; %entry
1394; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1395; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1396; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1398; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1399; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1400; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1401; GFX11-CU-NEXT:    s_endpgm
1402;
1403; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
1404; GFX12-WGP:       ; %bb.0: ; %entry
1405; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1406; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1407; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1408; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1409; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1410; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1411; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1412; GFX12-WGP-NEXT:    s_endpgm
1413;
1414; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
1415; GFX12-CU:       ; %bb.0: ; %entry
1416; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1417; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1418; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1419; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1420; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1421; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1422; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1423; GFX12-CU-NEXT:    s_endpgm
1424    ptr %out, i32 %in) {
1425entry:
1426  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
1427  ret void
1428}
1429
; Test kernel: atomically exchanges %in into the i32 at %out using
; syncscope("workgroup") with acquire ordering; the old value (%val) is unused.
; In every expected sequence below a wait follows the swap, and some sequences
; additionally expect one of buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0
; or global_inv before s_endpgm.
1430define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
1431; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
1432; GFX7:       ; %bb.0: ; %entry
1433; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1434; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1435; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1436; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1437; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1438; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1439; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1440; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX7-NEXT:    s_endpgm
1442;
1443; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
1444; GFX10-WGP:       ; %bb.0: ; %entry
1445; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1446; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1447; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1448; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1449; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1450; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1451; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1452; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1454; GFX10-WGP-NEXT:    buffer_gl0_inv
1455; GFX10-WGP-NEXT:    s_endpgm
1456;
1457; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
1458; GFX10-CU:       ; %bb.0: ; %entry
1459; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1460; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1461; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1462; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1463; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1464; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1465; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1466; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1467; GFX10-CU-NEXT:    s_endpgm
1468;
1469; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw:
1470; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1471; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1472; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1473; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1474; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1475; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1476; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1477; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1478; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1479; SKIP-CACHE-INV-NEXT:    s_endpgm
1480;
1481; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1482; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1483; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1484; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1485; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1486; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1487; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1488; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1489; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1491;
1492; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1493; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1494; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1495; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1496; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1497; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1498; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1499; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1500; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1501; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1502; GFX90A-TGSPLIT-NEXT:    s_endpgm
1503;
1504; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1505; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1506; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1507; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1508; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1509; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1510; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1511; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1512; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1513; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1514;
1515; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1516; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1517; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1518; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1519; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1520; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1521; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1522; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1523; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1524; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
1525; GFX940-TGSPLIT-NEXT:    s_endpgm
1526;
1527; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
1528; GFX11-WGP:       ; %bb.0: ; %entry
1529; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1530; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1531; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1532; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1533; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1534; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1535; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1536; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1537; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1538; GFX11-WGP-NEXT:    buffer_gl0_inv
1539; GFX11-WGP-NEXT:    s_endpgm
1540;
1541; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw:
1542; GFX11-CU:       ; %bb.0: ; %entry
1543; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1544; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1545; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1546; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1547; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1548; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1549; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1550; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1551; GFX11-CU-NEXT:    s_endpgm
1552;
1553; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
1554; GFX12-WGP:       ; %bb.0: ; %entry
1555; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1556; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1557; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1558; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1559; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1560; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1561; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1562; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
1563; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
1564; GFX12-WGP-NEXT:    s_endpgm
1565;
1566; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw:
1567; GFX12-CU:       ; %bb.0: ; %entry
1568; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1569; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1570; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1571; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1572; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1573; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1574; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1575; GFX12-CU-NEXT:    s_wait_dscnt 0x0
1576; GFX12-CU-NEXT:    s_endpgm
1577    ptr %out, i32 %in) {
1578entry:
1579  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
1580  ret void
1581}
1582
; Test kernel: atomically exchanges %in into the i32 at %out using
; syncscope("workgroup") with release ordering; the old value (%val) is unused.
; In every expected sequence below one or more wait instructions sit directly
; before the swap (after the register moves), and s_endpgm follows the swap
; with nothing in between.
1583define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
1584; GFX7-LABEL: flat_workgroup_release_atomicrmw:
1585; GFX7:       ; %bb.0: ; %entry
1586; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1587; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1588; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1589; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1590; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1591; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1592; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1593; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1594; GFX7-NEXT:    s_endpgm
1595;
1596; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
1597; GFX10-WGP:       ; %bb.0: ; %entry
1598; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1599; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1600; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1602; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1603; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1604; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1605; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1606; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1607; GFX10-WGP-NEXT:    s_endpgm
1608;
1609; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
1610; GFX10-CU:       ; %bb.0: ; %entry
1611; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1612; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1613; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1615; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1616; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1617; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1618; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1619; GFX10-CU-NEXT:    s_endpgm
1620;
1621; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw:
1622; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1623; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1624; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1625; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1626; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1627; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1628; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1629; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1630; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1631; SKIP-CACHE-INV-NEXT:    s_endpgm
1632;
1633; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1634; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1635; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1636; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1637; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1638; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1639; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1640; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1641; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1642; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1643;
1644; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1645; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1646; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1647; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1648; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1650; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1651; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1652; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1653; GFX90A-TGSPLIT-NEXT:    s_endpgm
1654;
1655; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1656; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1657; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1658; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1659; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1660; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1661; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1662; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1663; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1664; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1665;
1666; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1667; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1668; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1669; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1670; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1671; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1672; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1673; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1674; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1675; GFX940-TGSPLIT-NEXT:    s_endpgm
1676;
1677; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw:
1678; GFX11-WGP:       ; %bb.0: ; %entry
1679; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1680; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1681; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1682; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1683; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1684; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1685; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1686; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1687; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1688; GFX11-WGP-NEXT:    s_endpgm
1689;
1690; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw:
1691; GFX11-CU:       ; %bb.0: ; %entry
1692; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1693; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1694; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1696; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1697; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1698; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1700; GFX11-CU-NEXT:    s_endpgm
1701;
1702; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw:
1703; GFX12-WGP:       ; %bb.0: ; %entry
1704; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1705; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1706; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1707; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1708; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1709; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1710; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1711; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1712; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1713; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1714; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1715; GFX12-WGP-NEXT:    s_endpgm
1716;
1717; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw:
1718; GFX12-CU:       ; %bb.0: ; %entry
1719; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1720; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1721; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1722; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1723; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1724; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1725; GFX12-CU-NEXT:    s_wait_dscnt 0x0
1726; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1727; GFX12-CU-NEXT:    s_endpgm
1728    ptr %out, i32 %in) {
1729entry:
1730  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
1731  ret void
1732}
1733
; Test kernel: atomically exchanges %in into the i32 at %out using
; syncscope("workgroup") with acq_rel ordering; the old value (%val) is unused.
; Every expected sequence below has a wait directly before the swap and another
; directly after it; some sequences additionally expect one of buffer_gl0_inv,
; buffer_wbinvl1_vol, buffer_inv sc0 or global_inv before s_endpgm.
1734define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
1735; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
1736; GFX7:       ; %bb.0: ; %entry
1737; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1738; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1739; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1741; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1742; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1743; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1744; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1745; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1746; GFX7-NEXT:    s_endpgm
1747;
1748; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
1749; GFX10-WGP:       ; %bb.0: ; %entry
1750; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1751; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1752; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1754; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1755; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1756; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1757; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1758; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1759; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1760; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1761; GFX10-WGP-NEXT:    buffer_gl0_inv
1762; GFX10-WGP-NEXT:    s_endpgm
1763;
1764; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
1765; GFX10-CU:       ; %bb.0: ; %entry
1766; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1767; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1768; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1769; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1770; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1771; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1772; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1774; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1775; GFX10-CU-NEXT:    s_endpgm
1776;
1777; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
1778; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1779; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1780; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1781; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1783; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1784; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1785; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1786; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1787; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1788; SKIP-CACHE-INV-NEXT:    s_endpgm
1789;
1790; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1791; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1792; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1793; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1794; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1796; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1797; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1798; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1799; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1800; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1801;
1802; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1803; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1804; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1805; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1806; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1807; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1808; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1809; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1810; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1811; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1812; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1813; GFX90A-TGSPLIT-NEXT:    s_endpgm
1814;
1815; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1816; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1817; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1818; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1819; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1821; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1822; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1823; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1824; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1826;
1827; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1828; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1829; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1830; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1831; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1832; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1833; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1834; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1835; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1836; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1837; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
1838; GFX940-TGSPLIT-NEXT:    s_endpgm
1839;
1840; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
1841; GFX11-WGP:       ; %bb.0: ; %entry
1842; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1843; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1844; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1845; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1846; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1847; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1848; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1849; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1850; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1851; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1852; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1853; GFX11-WGP-NEXT:    buffer_gl0_inv
1854; GFX11-WGP-NEXT:    s_endpgm
1855;
1856; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
1857; GFX11-CU:       ; %bb.0: ; %entry
1858; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1859; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1860; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1862; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1863; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1864; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1866; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1867; GFX11-CU-NEXT:    s_endpgm
1868;
1869; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
1870; GFX12-WGP:       ; %bb.0: ; %entry
1871; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1872; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1873; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1874; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1875; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1876; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1877; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1878; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1879; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1880; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1881; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1882; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
1883; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
1884; GFX12-WGP-NEXT:    s_endpgm
1885;
1886; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
1887; GFX12-CU:       ; %bb.0: ; %entry
1888; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1889; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1890; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1891; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1892; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1893; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1894; GFX12-CU-NEXT:    s_wait_dscnt 0x0
1895; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1896; GFX12-CU-NEXT:    s_wait_dscnt 0x0
1897; GFX12-CU-NEXT:    s_endpgm
1898    ptr %out, i32 %in) {
1899entry:
1900  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
1901  ret void
1902}
1903
; Test kernel: atomically exchanges %in into the i32 at %out using
; syncscope("workgroup") with seq_cst ordering; the old value (%val) is unused.
; Every expected sequence below has a wait directly before the swap and another
; directly after it; some sequences additionally expect one of buffer_gl0_inv,
; buffer_wbinvl1_vol, buffer_inv sc0 or global_inv before s_endpgm.
1904define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
1905; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
1906; GFX7:       ; %bb.0: ; %entry
1907; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1908; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1909; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1910; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1911; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1912; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1913; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1914; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1915; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1916; GFX7-NEXT:    s_endpgm
1917;
1918; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
1919; GFX10-WGP:       ; %bb.0: ; %entry
1920; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1921; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1922; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1923; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1924; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1925; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1926; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1927; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1928; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1929; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1930; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1931; GFX10-WGP-NEXT:    buffer_gl0_inv
1932; GFX10-WGP-NEXT:    s_endpgm
1933;
1934; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
1935; GFX10-CU:       ; %bb.0: ; %entry
1936; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1937; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1938; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1939; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1940; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1941; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1942; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1943; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1944; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1945; GFX10-CU-NEXT:    s_endpgm
1946;
1947; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
1948; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1949; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1950; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1951; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1952; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1953; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1954; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1955; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1956; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1957; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1958; SKIP-CACHE-INV-NEXT:    s_endpgm
1959;
1960; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1961; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1962; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1963; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1964; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1965; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1966; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1967; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1969; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1970; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1971;
1972; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1973; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1974; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1975; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1976; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1977; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1978; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1979; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1980; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1981; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1982; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1983; GFX90A-TGSPLIT-NEXT:    s_endpgm
1984;
1985; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1986; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1987; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1988; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1989; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1990; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1991; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1992; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1993; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1994; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1995; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1996;
1997; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1998; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1999; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2000; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
2001; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2002; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2003; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
2004; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2005; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
2006; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2007; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
2008; GFX940-TGSPLIT-NEXT:    s_endpgm
2009;
2010; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
2011; GFX11-WGP:       ; %bb.0: ; %entry
2012; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2013; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
2014; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2015; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
2016; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
2017; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
2018; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2019; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2020; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
2021; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2022; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2023; GFX11-WGP-NEXT:    buffer_gl0_inv
2024; GFX11-WGP-NEXT:    s_endpgm
2025;
2026; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
2027; GFX11-CU:       ; %bb.0: ; %entry
2028; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2029; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
2030; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2031; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
2032; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
2033; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
2034; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2035; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
2036; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX11-CU-NEXT:    s_endpgm
2038;
2039; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
2040; GFX12-WGP:       ; %bb.0: ; %entry
2041; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2042; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
2043; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2044; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
2045; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
2046; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
2047; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2048; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2049; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
2050; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2051; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
2052; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
2053; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
2054; GFX12-WGP-NEXT:    s_endpgm
2055;
2056; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
2057; GFX12-CU:       ; %bb.0: ; %entry
2058; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2059; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
2060; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2061; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
2062; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
2063; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
2064; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2065; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
2066; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2067; GFX12-CU-NEXT:    s_endpgm
2068    ptr %out, i32 %in) {
2069entry:
2070  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
2071  ret void
2072}
2073
; Returning workgroup-scope acquire exchange on a flat pointer: a volatile
; `atomicrmw xchg` of %in into %out whose old value is then stored back to %out.
; The expected code uses the value-returning form of the swap (glc, sc0 or
; TH_ATOMIC_RETURN depending on the target). After the swap, the GFX10-WGP and
; GFX11-WGP runs expect buffer_gl0_inv, the GFX90A-TGSPLIT run expects
; buffer_wbinvl1_vol, the GFX940-TGSPLIT run expects "buffer_inv sc0" and the
; GFX12-WGP run expects "global_inv scope:SCOPE_SE"; the remaining runs expect
; no invalidate instruction between the swap and the store.
2074define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
2075; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2076; GFX7:       ; %bb.0: ; %entry
2077; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2078; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2079; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2080; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2081; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2082; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2083; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2084; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2086; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2087; GFX7-NEXT:    s_waitcnt vmcnt(0)
2088; GFX7-NEXT:    flat_store_dword v[0:1], v2
2089; GFX7-NEXT:    s_endpgm
2090;
2091; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2092; GFX10-WGP:       ; %bb.0: ; %entry
2093; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2094; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2095; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2096; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2097; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2098; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2099; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2100; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2101; GFX10-WGP-NEXT:    buffer_gl0_inv
2102; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2103; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2104; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2105; GFX10-WGP-NEXT:    s_endpgm
2106;
2107; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2108; GFX10-CU:       ; %bb.0: ; %entry
2109; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2110; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2111; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2112; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2113; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2114; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2115; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2116; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2117; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2118; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2119; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2120; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2121; GFX10-CU-NEXT:    s_endpgm
2122;
2123; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2124; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2125; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2126; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2127; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2128; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2129; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2130; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2131; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2132; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2133; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2134; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2135; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2136; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2137; SKIP-CACHE-INV-NEXT:    s_endpgm
2138;
2139; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2140; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2141; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2142; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2143; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2144; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2145; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2146; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2147; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2149; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2150; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2151; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2152;
2153; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2154; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2155; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2156; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2157; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2158; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2159; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2160; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2161; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2162; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2163; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2164; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2165; GFX90A-TGSPLIT-NEXT:    s_endpgm
2166;
2167; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2168; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2169; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2170; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2171; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2172; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2173; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2174; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2175; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2176; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2177; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2178; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2179; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2180;
2181; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2182; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2183; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2184; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2185; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2186; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2187; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2188; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2189; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2190; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
2191; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2192; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2193; GFX940-TGSPLIT-NEXT:    s_endpgm
2194;
2195; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2196; GFX11-WGP:       ; %bb.0: ; %entry
2197; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2198; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2199; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2200; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2201; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2202; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2203; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2204; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2205; GFX11-WGP-NEXT:    buffer_gl0_inv
2206; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2207; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2208; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2209; GFX11-WGP-NEXT:    s_endpgm
2210;
2211; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2212; GFX11-CU:       ; %bb.0: ; %entry
2213; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2214; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2215; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2216; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2217; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2218; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2219; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2220; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2222; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2223; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
2224; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2225; GFX11-CU-NEXT:    s_endpgm
2226;
2227; GFX12-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2228; GFX12-WGP:       ; %bb.0: ; %entry
2229; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2230; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2231; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2232; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2233; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2234; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2235; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2236; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2237; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
2238; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2239; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2240; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2241; GFX12-WGP-NEXT:    s_endpgm
2242;
2243; GFX12-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2244; GFX12-CU:       ; %bb.0: ; %entry
2245; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2246; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2247; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2248; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2249; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2250; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2251; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2252; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2253; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2254; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2255; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
2256; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2257; GFX12-CU-NEXT:    s_endpgm
2258    ptr %out, i32 %in) {
2259entry:
2260  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
  ; Write the value returned by the exchange back to %out.
2261  store i32 %val, ptr %out, align 4
2262  ret void
2263}
2264
; Returning workgroup-scope acq_rel exchange on a flat pointer: a volatile
; `atomicrmw xchg` of %in into %out whose old value is then stored back to %out.
; Every run expects a wait between the operand setup and the swap (for example
; "s_waitcnt lgkmcnt(0)" in the GFX7 run, an additional
; "s_waitcnt_vscnt null, 0x0" in the GFX10-WGP and GFX11-WGP runs, and the
; bvhcnt/samplecnt/storecnt/loadcnt_dscnt waits in the GFX12-WGP run).
; After the swap, an invalidate is expected only in the GFX10-WGP and GFX11-WGP
; runs (buffer_gl0_inv), the GFX12-WGP run ("global_inv scope:SCOPE_SE"), the
; GFX90A-TGSPLIT run (buffer_wbinvl1_vol) and the GFX940-TGSPLIT run
; ("buffer_inv sc0").
2265define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
2266; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2267; GFX7:       ; %bb.0: ; %entry
2268; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2269; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2270; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2271; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2272; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2273; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2274; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2275; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2276; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2277; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2278; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2279; GFX7-NEXT:    s_waitcnt vmcnt(0)
2280; GFX7-NEXT:    flat_store_dword v[0:1], v2
2281; GFX7-NEXT:    s_endpgm
2282;
2283; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2284; GFX10-WGP:       ; %bb.0: ; %entry
2285; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2286; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2287; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2288; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2289; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2290; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2291; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2292; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2293; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2294; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2295; GFX10-WGP-NEXT:    buffer_gl0_inv
2296; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2297; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2298; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2299; GFX10-WGP-NEXT:    s_endpgm
2300;
2301; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2302; GFX10-CU:       ; %bb.0: ; %entry
2303; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2304; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2305; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2306; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2307; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2308; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2309; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2310; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2311; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2313; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2314; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2315; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2316; GFX10-CU-NEXT:    s_endpgm
2317;
2318; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2319; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2320; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2321; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2322; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2323; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2324; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2326; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2327; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2328; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2329; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2331; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2332; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2333; SKIP-CACHE-INV-NEXT:    s_endpgm
2334;
2335; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2336; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2337; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2339; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2340; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2341; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2342; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2343; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2344; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2345; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2346; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2347; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2348; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2349;
2350; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2351; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2352; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2353; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2354; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2355; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2356; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2357; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2358; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2359; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2360; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2361; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2362; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2363; GFX90A-TGSPLIT-NEXT:    s_endpgm
2364;
2365; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2366; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2367; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2368; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2369; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2370; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2371; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2372; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2373; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2374; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2375; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2376; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2377; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2378; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2379;
2380; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2381; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2382; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2383; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2384; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2385; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2386; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2387; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2388; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2389; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2390; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
2391; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2392; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2393; GFX940-TGSPLIT-NEXT:    s_endpgm
2394;
2395; GFX11-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2396; GFX11-WGP:       ; %bb.0: ; %entry
2397; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2398; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2399; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2400; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2401; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2402; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2403; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2404; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2405; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2406; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2407; GFX11-WGP-NEXT:    buffer_gl0_inv
2408; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2409; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2410; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2411; GFX11-WGP-NEXT:    s_endpgm
2412;
2413; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2414; GFX11-CU:       ; %bb.0: ; %entry
2415; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2416; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2417; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2418; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2419; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2420; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2421; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2422; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2423; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2424; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2425; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2426; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
2427; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2428; GFX11-CU-NEXT:    s_endpgm
2429;
2430; GFX12-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2431; GFX12-WGP:       ; %bb.0: ; %entry
2432; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2433; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2434; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2435; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2436; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2437; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2438; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2439; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2440; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
2441; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2442; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2443; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2444; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2445; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2446; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
2447; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2448; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2449; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2450; GFX12-WGP-NEXT:    s_endpgm
2451;
2452; GFX12-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2453; GFX12-CU:       ; %bb.0: ; %entry
2454; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2455; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2456; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2457; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2458; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2459; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2460; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2461; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2462; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2463; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2464; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2465; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
2466; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2467; GFX12-CU-NEXT:    s_endpgm
2468    ptr %out, i32 %in) {
2469entry:
2470  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
  ; Write the value returned by the exchange back to %out.
2471  store i32 %val, ptr %out, align 4
2472  ret void
2473}
2474
; Returning workgroup-scope seq_cst exchange on a flat pointer: a volatile
; `atomicrmw xchg` of %in into %out whose old value is then stored back to %out.
; Every run expects a wait between the operand setup and the swap (for example
; "s_waitcnt lgkmcnt(0)" in the GFX7 run, an additional
; "s_waitcnt_vscnt null, 0x0" in the GFX10-WGP and GFX11-WGP runs, and the
; bvhcnt/samplecnt/storecnt/loadcnt_dscnt waits in the GFX12-WGP run).
; After the swap, an invalidate is expected only in the GFX10-WGP and GFX11-WGP
; runs (buffer_gl0_inv), the GFX12-WGP run ("global_inv scope:SCOPE_SE"), the
; GFX90A-TGSPLIT run (buffer_wbinvl1_vol) and the GFX940-TGSPLIT run
; ("buffer_inv sc0").
2475define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
2476; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2477; GFX7:       ; %bb.0: ; %entry
2478; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2479; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2480; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2482; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2483; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2484; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2485; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2486; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2487; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2488; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2489; GFX7-NEXT:    s_waitcnt vmcnt(0)
2490; GFX7-NEXT:    flat_store_dword v[0:1], v2
2491; GFX7-NEXT:    s_endpgm
2492;
2493; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2494; GFX10-WGP:       ; %bb.0: ; %entry
2495; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2496; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2497; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2498; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2499; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2500; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2501; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2502; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2503; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2504; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2505; GFX10-WGP-NEXT:    buffer_gl0_inv
2506; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2507; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2508; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2509; GFX10-WGP-NEXT:    s_endpgm
2510;
2511; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2512; GFX10-CU:       ; %bb.0: ; %entry
2513; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2514; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2515; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2516; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2517; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2518; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2519; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2520; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2521; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2522; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2523; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2524; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2525; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2526; GFX10-CU-NEXT:    s_endpgm
2527;
2528; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2529; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2530; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2531; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2532; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2533; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2534; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2536; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2537; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2538; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2541; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2542; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2543; SKIP-CACHE-INV-NEXT:    s_endpgm
2544;
2545; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2546; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2547; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2548; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2549; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2550; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2551; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2552; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2553; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2554; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2555; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2556; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2557; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2558; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2559;
2560; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2561; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2562; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2563; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2564; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2565; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2566; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2567; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2568; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2569; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2570; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2571; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2572; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2573; GFX90A-TGSPLIT-NEXT:    s_endpgm
2574;
2575; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2576; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2577; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2578; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2579; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2580; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2581; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2582; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2583; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2584; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2585; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2586; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2587; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2588; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2589;
2590; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2591; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2592; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2593; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2594; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2595; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2596; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2597; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2598; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2599; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2600; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
2601; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2602; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2603; GFX940-TGSPLIT-NEXT:    s_endpgm
2604;
2605; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2606; GFX11-WGP:       ; %bb.0: ; %entry
2607; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2608; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2609; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2610; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2611; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2612; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2613; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2614; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2615; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2616; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2617; GFX11-WGP-NEXT:    buffer_gl0_inv
2618; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2619; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2620; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2621; GFX11-WGP-NEXT:    s_endpgm
2622;
2623; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2624; GFX11-CU:       ; %bb.0: ; %entry
2625; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2626; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2627; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2628; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2629; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2630; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2631; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2632; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2633; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2634; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2635; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2636; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
2637; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2638; GFX11-CU-NEXT:    s_endpgm
2639;
2640; GFX12-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2641; GFX12-WGP:       ; %bb.0: ; %entry
2642; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2643; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2644; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2645; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2646; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2647; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2648; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2649; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2650; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
2651; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2652; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2653; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2654; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2655; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2656; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
2657; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2658; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2659; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2660; GFX12-WGP-NEXT:    s_endpgm
2661;
2662; GFX12-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2663; GFX12-CU:       ; %bb.0: ; %entry
2664; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2665; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2666; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2667; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2668; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2669; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2670; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2671; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2672; GFX12-CU-NEXT:    s_wait_dscnt 0x0
2673; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2674; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2675; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
2676; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2677; GFX12-CU-NEXT:    s_endpgm
2678    ptr %out, i32 %in) {
2679entry:
2680  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
  ; Write the value returned by the exchange back to %out.
2681  store i32 %val, ptr %out, align 4
2682  ret void
2683}
2684
2685define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
2686; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2687; GFX7:       ; %bb.0: ; %entry
2688; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2689; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2690; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2691; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2692; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2693; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2694; GFX7-NEXT:    s_mov_b32 s4, s8
2695; GFX7-NEXT:    s_mov_b32 s5, s9
2696; GFX7-NEXT:    s_mov_b32 s9, s10
2697; GFX7-NEXT:    s_mov_b32 s8, s11
2698; GFX7-NEXT:    s_add_u32 s4, s4, s9
2699; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2700; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2701; GFX7-NEXT:    s_mov_b32 s5, s8
2702; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2703; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2704; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2705; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2706; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2707; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2708; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2709; GFX7-NEXT:    s_endpgm
2710;
2711; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2712; GFX10-WGP:       ; %bb.0: ; %entry
2713; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
2714; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2715; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
2716; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
2717; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
2718; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2719; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
2720; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
2721; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
2722; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
2723; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
2724; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
2725; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2726; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
2727; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
2728; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2729; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2730; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
2731; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2732; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2733; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2734; GFX10-WGP-NEXT:    s_endpgm
2735;
2736; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2737; GFX10-CU:       ; %bb.0: ; %entry
2738; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
2739; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2740; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
2741; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
2742; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
2743; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2744; GFX10-CU-NEXT:    s_mov_b32 s4, s8
2745; GFX10-CU-NEXT:    s_mov_b32 s5, s9
2746; GFX10-CU-NEXT:    s_mov_b32 s9, s10
2747; GFX10-CU-NEXT:    s_mov_b32 s8, s11
2748; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
2749; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
2750; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2751; GFX10-CU-NEXT:    s_mov_b32 s5, s8
2752; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
2753; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2754; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2755; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
2756; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2757; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2758; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2759; GFX10-CU-NEXT:    s_endpgm
2760;
2761; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2762; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2763; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
2764; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
2765; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
2766; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
2767; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
2768; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2769; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2770; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2771; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
2772; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
2773; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
2774; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
2775; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
2776; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
2777; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2778; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2779; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2780; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
2781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2783; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2784; SKIP-CACHE-INV-NEXT:    s_endpgm
2785;
2786; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2787; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2788; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2789; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2790; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2791; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2792; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2793; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2794; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2795; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2796; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2797; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2798; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2799;
2800; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2801; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2802; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2803; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2804; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2805; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2806; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2807; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2808; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2809; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2810; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2811; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2812; GFX90A-TGSPLIT-NEXT:    s_endpgm
2813;
2814; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2815; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2816; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2817; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2818; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2819; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2820; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2821; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2822; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2823; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2824; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2825; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2826; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2827;
2828; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2829; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2830; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2831; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2832; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2833; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2834; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2835; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2836; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2837; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2838; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2839; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2840; GFX940-TGSPLIT-NEXT:    s_endpgm
2841;
2842; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2843; GFX11-WGP:       ; %bb.0: ; %entry
2844; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2845; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2846; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2847; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2848; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
2849; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
2850; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2851; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
2852; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2853; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2854; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2855; GFX11-WGP-NEXT:    s_endpgm
2856;
2857; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2858; GFX11-CU:       ; %bb.0: ; %entry
2859; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2860; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2861; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2862; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2863; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
2864; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
2865; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2866; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
2867; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2868; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2869; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2870; GFX11-CU-NEXT:    s_endpgm
2871;
2872; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2873; GFX12-WGP:       ; %bb.0: ; %entry
2874; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2875; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2876; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2877; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2878; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
2879; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
2880; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2881; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
2882; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2883; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2884; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
2885; GFX12-WGP-NEXT:    s_endpgm
2886;
2887; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2888; GFX12-CU:       ; %bb.0: ; %entry
2889; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2890; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2891; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2892; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2893; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
2894; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
2895; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2896; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
2897; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2898; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2899; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2900; GFX12-CU-NEXT:    s_endpgm
2901    ptr %out, i32 %in, i32 %old) {
2902entry:
2903  %gep = getelementptr i32, ptr %out, i32 4
2904  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
2905  ret void
2906}
2907
; Test kernel: volatile cmpxchg on the i32 at %out[4] (16 bytes past %out; the
; checks show this either as offset:16 or as an explicit 64-bit add of 16),
; with syncscope("workgroup"), success ordering acquire and failure ordering
; monotonic. The result (%val) is unused.
; Per the check lines below, every configuration expects a wait instruction
; after the flat atomic. The gfx1010, gfx1100 and gfx1200 runs without +cumode
; and both +tgsplit runs additionally expect an invalidate instruction
; (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or global_inv) before
; s_endpgm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
2908define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
2909; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2910; GFX7:       ; %bb.0: ; %entry
2911; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2912; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2913; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2914; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2915; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2917; GFX7-NEXT:    s_mov_b32 s4, s8
2918; GFX7-NEXT:    s_mov_b32 s5, s9
2919; GFX7-NEXT:    s_mov_b32 s9, s10
2920; GFX7-NEXT:    s_mov_b32 s8, s11
2921; GFX7-NEXT:    s_add_u32 s4, s4, s9
2922; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2923; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2924; GFX7-NEXT:    s_mov_b32 s5, s8
2925; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2926; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2927; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2928; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2929; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2930; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2931; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2932; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2933; GFX7-NEXT:    s_endpgm
2934;
2935; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2936; GFX10-WGP:       ; %bb.0: ; %entry
2937; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
2938; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2939; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
2940; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
2941; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
2942; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2943; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
2944; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
2945; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
2946; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
2947; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
2948; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
2949; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2950; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
2951; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
2952; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2953; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2954; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
2955; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2956; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2957; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2958; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2959; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2960; GFX10-WGP-NEXT:    buffer_gl0_inv
2961; GFX10-WGP-NEXT:    s_endpgm
2962;
2963; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2964; GFX10-CU:       ; %bb.0: ; %entry
2965; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
2966; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2967; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
2968; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
2969; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
2970; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2971; GFX10-CU-NEXT:    s_mov_b32 s4, s8
2972; GFX10-CU-NEXT:    s_mov_b32 s5, s9
2973; GFX10-CU-NEXT:    s_mov_b32 s9, s10
2974; GFX10-CU-NEXT:    s_mov_b32 s8, s11
2975; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
2976; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
2977; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2978; GFX10-CU-NEXT:    s_mov_b32 s5, s8
2979; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
2980; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2981; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2982; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
2983; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2984; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2985; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2986; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX10-CU-NEXT:    s_endpgm
2988;
2989; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2990; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2991; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
2992; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
2993; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
2994; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
2995; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
2996; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2997; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2998; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2999; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3000; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3001; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3002; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3003; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3004; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3005; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3006; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3007; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3009; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3010; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3011; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3012; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3013; SKIP-CACHE-INV-NEXT:    s_endpgm
3014;
3015; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3016; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3017; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3018; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3019; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3020; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3021; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3022; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3023; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3024; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3025; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3026; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3027; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3028; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3029;
3030; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3031; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3032; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3033; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3034; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3035; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3036; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3037; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3038; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3039; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3040; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3041; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3042; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3043; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3044; GFX90A-TGSPLIT-NEXT:    s_endpgm
3045;
3046; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3047; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3048; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3049; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3050; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3051; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3052; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3053; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3054; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3055; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3056; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3057; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3058; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3059; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3060;
3061; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3062; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3063; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3064; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3065; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3066; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3067; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3068; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3069; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3070; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3071; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3072; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3073; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3074; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
3075; GFX940-TGSPLIT-NEXT:    s_endpgm
3076;
3077; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3078; GFX11-WGP:       ; %bb.0: ; %entry
3079; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3080; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3081; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3082; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3083; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3084; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3085; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3086; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3087; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3088; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3089; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3090; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3091; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3092; GFX11-WGP-NEXT:    buffer_gl0_inv
3093; GFX11-WGP-NEXT:    s_endpgm
3094;
3095; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3096; GFX11-CU:       ; %bb.0: ; %entry
3097; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3098; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3099; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3100; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3101; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3102; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3103; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3104; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3105; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3106; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3107; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3108; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3109; GFX11-CU-NEXT:    s_endpgm
3110;
3111; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3112; GFX12-WGP:       ; %bb.0: ; %entry
3113; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3114; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3115; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3116; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3117; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3118; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3119; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3120; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3121; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3122; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3123; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3124; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
3125; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
3126; GFX12-WGP-NEXT:    s_endpgm
3127;
3128; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3129; GFX12-CU:       ; %bb.0: ; %entry
3130; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3131; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3132; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3133; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3134; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3135; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3136; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3137; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3138; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3139; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3140; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3141; GFX12-CU-NEXT:    s_wait_dscnt 0x0
3142; GFX12-CU-NEXT:    s_endpgm
3143    ptr %out, i32 %in, i32 %old) {
3144entry:
3145  %gep = getelementptr i32, ptr %out, i32 4
3146  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
3147  ret void
3148}
3149
; Test kernel: volatile cmpxchg on the i32 at %out[4] (16 bytes past %out; the
; checks show this either as offset:16 or as an explicit 64-bit add of 16),
; with syncscope("workgroup"), success ordering release and failure ordering
; monotonic. The result (%val) is unused.
; Per the check lines below, every configuration expects one or more wait
; instructions immediately before the flat atomic, and nothing between the
; atomic and s_endpgm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3150define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
3151; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3152; GFX7:       ; %bb.0: ; %entry
3153; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3154; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3155; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3156; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3157; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3158; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3159; GFX7-NEXT:    s_mov_b32 s4, s8
3160; GFX7-NEXT:    s_mov_b32 s5, s9
3161; GFX7-NEXT:    s_mov_b32 s9, s10
3162; GFX7-NEXT:    s_mov_b32 s8, s11
3163; GFX7-NEXT:    s_add_u32 s4, s4, s9
3164; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3165; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3166; GFX7-NEXT:    s_mov_b32 s5, s8
3167; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3168; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3169; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3170; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3171; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3172; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3173; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3174; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3175; GFX7-NEXT:    s_endpgm
3176;
3177; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3178; GFX10-WGP:       ; %bb.0: ; %entry
3179; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3180; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3181; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3182; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3183; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3184; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3185; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3186; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3187; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3188; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3189; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3190; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3191; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3192; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3193; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3194; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3195; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3196; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3197; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3198; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3199; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3200; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3201; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3202; GFX10-WGP-NEXT:    s_endpgm
3203;
3204; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3205; GFX10-CU:       ; %bb.0: ; %entry
3206; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3207; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3208; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3209; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3210; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3211; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3212; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3213; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3214; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3215; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3216; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3217; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3218; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3219; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3220; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3221; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3222; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3223; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3224; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3225; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3226; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3227; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3228; GFX10-CU-NEXT:    s_endpgm
3229;
3230; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3231; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3232; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3233; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3234; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3235; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3236; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3237; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3238; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3239; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3240; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3241; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3242; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3243; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3244; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3245; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3246; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3247; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3248; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3252; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3253; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3254; SKIP-CACHE-INV-NEXT:    s_endpgm
3255;
3256; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3257; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3258; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3259; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3260; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3261; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3262; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3263; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3264; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3265; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3266; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3267; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3268; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3269; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3270;
3271; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3272; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3273; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3274; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3275; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3276; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3277; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3278; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3279; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3280; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3281; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3282; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3283; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3284; GFX90A-TGSPLIT-NEXT:    s_endpgm
3285;
3286; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3287; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3288; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3289; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3290; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3291; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3292; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3293; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3294; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3295; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3296; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3297; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3298; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3299; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3300;
3301; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3302; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3303; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3304; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3305; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3306; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3307; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3308; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3309; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3310; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3311; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3312; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3313; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3314; GFX940-TGSPLIT-NEXT:    s_endpgm
3315;
3316; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3317; GFX11-WGP:       ; %bb.0: ; %entry
3318; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3319; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3320; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3321; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3322; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3323; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3324; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3325; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3326; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3327; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3328; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3329; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3330; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3331; GFX11-WGP-NEXT:    s_endpgm
3332;
3333; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3334; GFX11-CU:       ; %bb.0: ; %entry
3335; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3336; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3337; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3338; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3339; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3340; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3341; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3342; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3343; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3344; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3345; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3346; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3347; GFX11-CU-NEXT:    s_endpgm
3348;
3349; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3350; GFX12-WGP:       ; %bb.0: ; %entry
3351; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3352; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3353; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3354; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3355; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3356; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3357; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3358; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3359; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3360; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3361; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
3362; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
3363; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
3364; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
3365; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3366; GFX12-WGP-NEXT:    s_endpgm
3367;
3368; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3369; GFX12-CU:       ; %bb.0: ; %entry
3370; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3371; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3372; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3373; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3374; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3375; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3376; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3377; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3378; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3379; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3380; GFX12-CU-NEXT:    s_wait_dscnt 0x0
3381; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3382; GFX12-CU-NEXT:    s_endpgm
3383    ptr %out, i32 %in, i32 %old) {
3384entry:
3385  %gep = getelementptr i32, ptr %out, i32 4
3386  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
3387  ret void
3388}
3389
; Codegen test for a volatile i32 cmpxchg through a flat pointer with
; syncscope("workgroup"), success ordering acq_rel and failure ordering
; monotonic. The loaded value is not used.
;
; In the autogenerated checks below, every configuration has a wait
; instruction (s_waitcnt / s_wait_*) directly before the atomic and another
; directly after it. The WGP-mode runs (gfx10, gfx11, gfx12) and the tgsplit
; runs (gfx90a, gfx940) additionally expect a cache invalidate after the
; trailing wait (buffer_gl0_inv, global_inv scope:SCOPE_SE, buffer_wbinvl1_vol
; or buffer_inv sc0 respectively); the other runs expect s_endpgm right after
; the trailing wait.
3390define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
3391; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3392; GFX7:       ; %bb.0: ; %entry
3393; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3394; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3395; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3396; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3397; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3398; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3399; GFX7-NEXT:    s_mov_b32 s4, s8
3400; GFX7-NEXT:    s_mov_b32 s5, s9
3401; GFX7-NEXT:    s_mov_b32 s9, s10
3402; GFX7-NEXT:    s_mov_b32 s8, s11
3403; GFX7-NEXT:    s_add_u32 s4, s4, s9
3404; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3405; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3406; GFX7-NEXT:    s_mov_b32 s5, s8
3407; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3408; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3409; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3410; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3411; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3412; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3413; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3414; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3415; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3416; GFX7-NEXT:    s_endpgm
3417;
3418; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3419; GFX10-WGP:       ; %bb.0: ; %entry
3420; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3421; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3422; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3423; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3424; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3425; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3426; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3427; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3428; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3429; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3430; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3431; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3432; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3433; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3434; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3435; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3436; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3437; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3438; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3439; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3440; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3441; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3442; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3443; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3444; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3445; GFX10-WGP-NEXT:    buffer_gl0_inv
3446; GFX10-WGP-NEXT:    s_endpgm
3447;
3448; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3449; GFX10-CU:       ; %bb.0: ; %entry
3450; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3451; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3452; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3453; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3454; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3455; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3456; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3457; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3458; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3459; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3460; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3461; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3462; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3463; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3464; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3465; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3466; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3467; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3468; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3469; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3470; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3471; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3472; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3473; GFX10-CU-NEXT:    s_endpgm
3474;
3475; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3476; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3477; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3478; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3479; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3480; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3481; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3482; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3484; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3485; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3486; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3487; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3488; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3489; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3490; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3491; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3492; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3493; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3494; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3497; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3498; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3499; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3500; SKIP-CACHE-INV-NEXT:    s_endpgm
3501;
3502; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3503; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3504; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3505; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3506; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3507; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3508; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3509; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3510; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3511; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3512; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3513; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3514; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3515; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3516; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3517;
3518; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3519; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3520; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3521; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3522; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3523; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3524; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3525; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3526; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3527; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3528; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3529; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3530; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3531; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3532; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3533; GFX90A-TGSPLIT-NEXT:    s_endpgm
3534;
3535; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3536; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3537; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3538; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3539; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3540; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3541; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3542; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3543; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3544; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3545; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3546; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3547; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3548; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3549; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3550;
3551; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3552; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3553; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3554; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3555; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3556; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3557; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3558; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3559; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3560; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3561; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3562; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3563; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3564; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3565; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
3566; GFX940-TGSPLIT-NEXT:    s_endpgm
3567;
3568; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3569; GFX11-WGP:       ; %bb.0: ; %entry
3570; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3571; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3572; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3573; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3574; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3575; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3576; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3577; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3578; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3579; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3580; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3581; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3582; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3583; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3584; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3585; GFX11-WGP-NEXT:    buffer_gl0_inv
3586; GFX11-WGP-NEXT:    s_endpgm
3587;
3588; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3589; GFX11-CU:       ; %bb.0: ; %entry
3590; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3591; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3592; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3593; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3594; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3595; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3596; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3597; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3598; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3599; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3600; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3601; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3602; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3603; GFX11-CU-NEXT:    s_endpgm
3604;
3605; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3606; GFX12-WGP:       ; %bb.0: ; %entry
3607; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3608; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3609; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3610; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3611; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3612; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3613; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3614; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3615; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3616; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3617; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
3618; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
3619; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
3620; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
3621; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3622; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
3623; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
3624; GFX12-WGP-NEXT:    s_endpgm
3625;
3626; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3627; GFX12-CU:       ; %bb.0: ; %entry
3628; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3629; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3630; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3631; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3632; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3633; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3634; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3635; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3636; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3637; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3638; GFX12-CU-NEXT:    s_wait_dscnt 0x0
3639; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3640; GFX12-CU-NEXT:    s_wait_dscnt 0x0
3641; GFX12-CU-NEXT:    s_endpgm
3642    ptr %out, i32 %in, i32 %old) {
3643entry:
  ; Index 4 of an i32 array, i.e. a 16-byte displacement from %out. The checks
  ; above show it either folded into the atomic as offset:16 or materialized
  ; as the constant 16 and added with s_add_u32/s_addc_u32 (gfx700 and gfx1010
  ; runs).
3644  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the i32 at %gep with %old and store %in on a match; %val is never
  ; read.
3645  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
3646  ret void
3647}
3648
; Codegen test for a volatile i32 cmpxchg through a flat pointer with
; syncscope("workgroup"), success ordering seq_cst and failure ordering
; monotonic. The loaded value is not used.
;
; In the autogenerated checks below, every configuration has a wait
; instruction (s_waitcnt / s_wait_*) directly before the atomic and another
; directly after it. The WGP-mode runs (gfx10, gfx11, gfx12) and the tgsplit
; runs (gfx90a, gfx940) additionally expect a cache invalidate after the
; trailing wait (buffer_gl0_inv, global_inv scope:SCOPE_SE, buffer_wbinvl1_vol
; or buffer_inv sc0 respectively); the other runs expect s_endpgm right after
; the trailing wait.
3649define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
3650; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3651; GFX7:       ; %bb.0: ; %entry
3652; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3653; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3654; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3655; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3656; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3657; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3658; GFX7-NEXT:    s_mov_b32 s4, s8
3659; GFX7-NEXT:    s_mov_b32 s5, s9
3660; GFX7-NEXT:    s_mov_b32 s9, s10
3661; GFX7-NEXT:    s_mov_b32 s8, s11
3662; GFX7-NEXT:    s_add_u32 s4, s4, s9
3663; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3664; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3665; GFX7-NEXT:    s_mov_b32 s5, s8
3666; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3667; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3668; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3669; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3670; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3671; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3672; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3673; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3674; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3675; GFX7-NEXT:    s_endpgm
3676;
3677; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3678; GFX10-WGP:       ; %bb.0: ; %entry
3679; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3680; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3681; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3682; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3683; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3684; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3685; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3686; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3687; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3688; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3689; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3690; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3691; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3692; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3693; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3694; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3695; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3696; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3697; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3698; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3699; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3700; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3701; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3702; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3703; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3704; GFX10-WGP-NEXT:    buffer_gl0_inv
3705; GFX10-WGP-NEXT:    s_endpgm
3706;
3707; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3708; GFX10-CU:       ; %bb.0: ; %entry
3709; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3710; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3711; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3712; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3713; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3714; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3715; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3716; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3717; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3718; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3719; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3720; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3721; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3722; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3723; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3724; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3725; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3726; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3727; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3728; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3729; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3730; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3731; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3732; GFX10-CU-NEXT:    s_endpgm
3733;
3734; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3735; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3736; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3737; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3738; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3739; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3740; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3741; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3742; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3743; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3744; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3745; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3746; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3747; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3748; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3749; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3750; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3751; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3752; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3754; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3756; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3757; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3758; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3759; SKIP-CACHE-INV-NEXT:    s_endpgm
3760;
3761; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3762; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3763; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3764; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3765; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3766; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3767; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3768; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3769; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3770; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3771; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3772; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3773; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3774; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3775; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3776;
3777; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3778; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3779; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3780; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3781; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3782; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3783; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3784; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3785; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3786; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3787; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3788; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3789; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3790; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3791; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3792; GFX90A-TGSPLIT-NEXT:    s_endpgm
3793;
3794; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3795; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3796; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3797; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3798; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3799; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3800; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3801; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3802; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3803; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3804; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3805; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3806; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3807; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3808; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3809;
3810; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3811; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3812; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3813; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3814; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3815; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3816; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3817; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3818; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3819; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3820; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3821; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3822; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3823; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3824; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
3825; GFX940-TGSPLIT-NEXT:    s_endpgm
3826;
3827; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3828; GFX11-WGP:       ; %bb.0: ; %entry
3829; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3830; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3831; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3832; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3833; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3834; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3835; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3836; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3837; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3838; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3839; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3840; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3841; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3842; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3843; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3844; GFX11-WGP-NEXT:    buffer_gl0_inv
3845; GFX11-WGP-NEXT:    s_endpgm
3846;
3847; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3848; GFX11-CU:       ; %bb.0: ; %entry
3849; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3850; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3851; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3852; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3853; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3854; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3855; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3856; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3857; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3858; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3859; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3860; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3861; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3862; GFX11-CU-NEXT:    s_endpgm
3863;
3864; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3865; GFX12-WGP:       ; %bb.0: ; %entry
3866; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3867; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3868; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3869; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3870; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3871; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3872; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3873; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3874; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3875; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3876; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
3877; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
3878; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
3879; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
3880; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3881; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
3882; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
3883; GFX12-WGP-NEXT:    s_endpgm
3884;
3885; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3886; GFX12-CU:       ; %bb.0: ; %entry
3887; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3888; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3889; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3890; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3891; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3892; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3893; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3894; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3895; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3896; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3897; GFX12-CU-NEXT:    s_wait_dscnt 0x0
3898; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3899; GFX12-CU-NEXT:    s_wait_dscnt 0x0
3900; GFX12-CU-NEXT:    s_endpgm
3901    ptr %out, i32 %in, i32 %old) {
3902entry:
  ; Index 4 of an i32 array, i.e. a 16-byte displacement from %out. The checks
  ; above show it either folded into the atomic as offset:16 or materialized
  ; as the constant 16 and added with s_add_u32/s_addc_u32 (gfx700 and gfx1010
  ; runs).
3903  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the i32 at %gep with %old and store %in on a match; %val is never
  ; read.
3904  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
3905  ret void
3906}
3907
3908define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
3909; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3910; GFX7:       ; %bb.0: ; %entry
3911; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3912; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3913; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3914; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3915; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3917; GFX7-NEXT:    s_mov_b32 s4, s8
3918; GFX7-NEXT:    s_mov_b32 s5, s9
3919; GFX7-NEXT:    s_mov_b32 s9, s10
3920; GFX7-NEXT:    s_mov_b32 s8, s11
3921; GFX7-NEXT:    s_add_u32 s4, s4, s9
3922; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3923; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3924; GFX7-NEXT:    s_mov_b32 s5, s8
3925; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3926; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3927; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3928; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3929; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3930; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3931; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3932; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3933; GFX7-NEXT:    s_endpgm
3934;
3935; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3936; GFX10-WGP:       ; %bb.0: ; %entry
3937; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3938; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3939; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3940; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3941; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3942; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3943; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3944; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3945; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3946; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3947; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3948; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3949; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3950; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3951; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3952; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3953; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3954; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3955; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3956; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3957; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3958; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3959; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3960; GFX10-WGP-NEXT:    buffer_gl0_inv
3961; GFX10-WGP-NEXT:    s_endpgm
3962;
3963; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3964; GFX10-CU:       ; %bb.0: ; %entry
3965; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3966; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3967; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3968; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3969; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3970; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3971; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3972; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3973; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3974; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3975; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3976; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3977; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3978; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3979; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3980; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3981; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3982; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3983; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3984; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3985; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3986; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3987; GFX10-CU-NEXT:    s_endpgm
3988;
3989; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3990; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3991; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3992; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3993; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3994; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3995; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3996; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3997; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3998; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3999; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4000; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4001; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4002; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4003; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4004; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4005; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4006; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4007; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4009; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4010; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4011; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4012; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4013; SKIP-CACHE-INV-NEXT:    s_endpgm
4014;
4015; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4016; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4017; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4018; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4019; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4020; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4021; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4022; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4023; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4024; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4025; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4026; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4027; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4028; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4029;
4030; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4031; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4032; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4033; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4034; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4035; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4036; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4037; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4038; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4039; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4040; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4041; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4042; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4043; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4044; GFX90A-TGSPLIT-NEXT:    s_endpgm
4045;
4046; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4047; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4048; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4049; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4050; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4051; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4052; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4053; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4054; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4055; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4056; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4057; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4058; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4059; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4060;
4061; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4062; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4063; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4064; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4065; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4066; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4067; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4068; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4069; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4070; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4071; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4072; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4073; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4074; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
4075; GFX940-TGSPLIT-NEXT:    s_endpgm
4076;
4077; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4078; GFX11-WGP:       ; %bb.0: ; %entry
4079; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4080; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4081; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4082; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4083; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4084; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4085; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4086; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4087; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4088; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4089; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4090; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4091; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4092; GFX11-WGP-NEXT:    buffer_gl0_inv
4093; GFX11-WGP-NEXT:    s_endpgm
4094;
4095; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4096; GFX11-CU:       ; %bb.0: ; %entry
4097; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4098; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4099; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4100; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4101; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4102; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4103; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4104; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4105; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4106; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4107; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4108; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4109; GFX11-CU-NEXT:    s_endpgm
4110;
4111; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4112; GFX12-WGP:       ; %bb.0: ; %entry
4113; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4114; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4115; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4116; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4117; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4118; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4119; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4120; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4121; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4122; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4123; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4124; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4125; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
4126; GFX12-WGP-NEXT:    s_endpgm
4127;
4128; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4129; GFX12-CU:       ; %bb.0: ; %entry
4130; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4131; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4132; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4133; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4134; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4135; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4136; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4137; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4138; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4139; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4140; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4141; GFX12-CU-NEXT:    s_wait_dscnt 0x0
4142; GFX12-CU-NEXT:    s_endpgm
4143    ptr %out, i32 %in, i32 %old) {
4144entry:
4145  %gep = getelementptr i32, ptr %out, i32 4
4146  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
4147  ret void
4148}
4149
; Workgroup-scope cmpxchg through a plain `ptr` argument (lowered to
; flat_atomic_cmpswap) with acquire success ordering and acquire failure
; ordering. The kernel offsets %out by four i32 elements and performs a
; volatile cmpxchg there; the result (%val) is never used.
;
; What the autogenerated checks below pin after the atomic instruction:
;   - GFX7, GFX10-CU, SKIP-CACHE-INV, both NOTTGSPLIT prefixes and GFX11-CU
;     expect only "s_waitcnt lgkmcnt(0)"; GFX12-CU expects only
;     "s_wait_dscnt 0x0".
;   - GFX10-WGP and GFX11-WGP additionally expect "s_waitcnt_vscnt null, 0x0"
;     followed by buffer_gl0_inv.
;   - Both TGSPLIT prefixes expect "s_waitcnt vmcnt(0)" followed by
;     buffer_wbinvl1_vol (gfx90a) or "buffer_inv sc0" (gfx940).
;   - GFX12-WGP expects scope:SCOPE_SE on the atomic itself, then
;     "s_wait_storecnt_dscnt 0x0" and "global_inv scope:SCOPE_SE".
; No prefix expects a wait inserted directly before the atomic.
4150define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
4151; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4152; GFX7:       ; %bb.0: ; %entry
4153; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4154; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4155; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4156; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4157; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4158; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4159; GFX7-NEXT:    s_mov_b32 s4, s8
4160; GFX7-NEXT:    s_mov_b32 s5, s9
4161; GFX7-NEXT:    s_mov_b32 s9, s10
4162; GFX7-NEXT:    s_mov_b32 s8, s11
4163; GFX7-NEXT:    s_add_u32 s4, s4, s9
4164; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4165; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4166; GFX7-NEXT:    s_mov_b32 s5, s8
4167; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4168; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4169; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4170; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4171; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4172; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4173; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4174; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4175; GFX7-NEXT:    s_endpgm
4176;
4177; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4178; GFX10-WGP:       ; %bb.0: ; %entry
4179; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4180; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4181; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4182; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4183; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4184; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4185; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4186; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4187; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4188; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4189; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4190; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4191; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4192; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4193; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4194; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4195; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4196; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4197; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4198; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4199; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4200; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4201; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4202; GFX10-WGP-NEXT:    buffer_gl0_inv
4203; GFX10-WGP-NEXT:    s_endpgm
4204;
4205; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4206; GFX10-CU:       ; %bb.0: ; %entry
4207; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4208; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4209; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4210; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4211; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4212; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4213; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4214; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4215; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4216; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4217; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4218; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4219; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4220; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4221; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4222; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4223; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4224; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4225; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4226; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4227; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4228; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4229; GFX10-CU-NEXT:    s_endpgm
4230;
4231; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4232; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4233; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4234; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4235; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4236; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4237; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4238; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4239; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4240; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4241; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4242; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4243; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4244; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4245; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4246; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4247; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4248; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4249; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4252; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4253; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4254; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4255; SKIP-CACHE-INV-NEXT:    s_endpgm
4256;
4257; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4258; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4259; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4260; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4261; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4262; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4263; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4264; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4265; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4266; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4267; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4268; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4269; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4270; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4271;
4272; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4273; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4274; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4275; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4276; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4277; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4278; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4279; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4280; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4281; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4282; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4283; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4284; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4285; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4286; GFX90A-TGSPLIT-NEXT:    s_endpgm
4287;
4288; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4289; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4290; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4291; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4292; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4293; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4294; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4295; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4296; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4297; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4298; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4299; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4300; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4301; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4302;
4303; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4304; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4305; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4306; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4307; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4308; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4309; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4310; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4311; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4312; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4313; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4314; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4315; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4316; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
4317; GFX940-TGSPLIT-NEXT:    s_endpgm
4318;
4319; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4320; GFX11-WGP:       ; %bb.0: ; %entry
4321; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4322; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4323; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4324; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4325; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4326; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4327; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4328; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4329; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4330; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4331; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4332; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4334; GFX11-WGP-NEXT:    buffer_gl0_inv
4335; GFX11-WGP-NEXT:    s_endpgm
4336;
4337; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4338; GFX11-CU:       ; %bb.0: ; %entry
4339; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4340; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4341; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4342; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4343; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4344; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4345; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4346; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4347; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4348; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4349; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4350; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4351; GFX11-CU-NEXT:    s_endpgm
4352;
4353; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4354; GFX12-WGP:       ; %bb.0: ; %entry
4355; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4356; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4357; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4358; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4359; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4360; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4361; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4362; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4363; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4364; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4365; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4366; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4367; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
4368; GFX12-WGP-NEXT:    s_endpgm
4369;
4370; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4371; GFX12-CU:       ; %bb.0: ; %entry
4372; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4373; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4374; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4375; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4376; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4377; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4378; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4379; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4380; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4381; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4382; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4383; GFX12-CU-NEXT:    s_wait_dscnt 0x0
4384; GFX12-CU-NEXT:    s_endpgm
4385    ptr %out, i32 %in, i32 %old) {
4386entry:
  ; Address of i32 element 4 of %out; this is the 16-byte offset that shows up
  ; in the checks either as "offset:16" or as an explicit 64-bit add of 16.
4387  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the memory value with %old and store %in on a match. Success
  ; ordering is acquire, failure ordering is acquire, scope is workgroup.
4388  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
4389  ret void
4390}
4391
; Workgroup-scope cmpxchg through a plain `ptr` argument (lowered to
; flat_atomic_cmpswap) with release success ordering and acquire failure
; ordering. The kernel offsets %out by four i32 elements and performs a
; volatile cmpxchg there; the result (%val) is never used.
;
; What the autogenerated checks below pin directly before the atomic:
;   - GFX7, GFX10-CU, SKIP-CACHE-INV, both NOTTGSPLIT prefixes and GFX11-CU
;     expect "s_waitcnt lgkmcnt(0)"; GFX12-CU expects "s_wait_dscnt 0x0".
;   - GFX10-WGP and GFX11-WGP expect "s_waitcnt vmcnt(0) lgkmcnt(0)" followed
;     by "s_waitcnt_vscnt null, 0x0".
;   - Both TGSPLIT prefixes expect "s_waitcnt vmcnt(0) lgkmcnt(0)".
;   - GFX12-WGP expects s_wait_bvhcnt, s_wait_samplecnt, s_wait_storecnt and
;     s_wait_loadcnt_dscnt, each with operand 0x0.
; What they pin directly after the atomic:
;   - GFX7, GFX10-CU, SKIP-CACHE-INV, both NOTTGSPLIT prefixes and GFX11-CU
;     expect only "s_waitcnt lgkmcnt(0)"; GFX12-CU expects only
;     "s_wait_dscnt 0x0".
;   - GFX10-WGP and GFX11-WGP expect "s_waitcnt lgkmcnt(0)",
;     "s_waitcnt_vscnt null, 0x0" and buffer_gl0_inv.
;   - Both TGSPLIT prefixes expect "s_waitcnt vmcnt(0)" followed by
;     buffer_wbinvl1_vol (gfx90a) or "buffer_inv sc0" (gfx940).
;   - GFX12-WGP expects scope:SCOPE_SE on the atomic itself, then
;     "s_wait_storecnt_dscnt 0x0" and "global_inv scope:SCOPE_SE".
4392define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
4393; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
4394; GFX7:       ; %bb.0: ; %entry
4395; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4396; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4397; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4398; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4399; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4400; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4401; GFX7-NEXT:    s_mov_b32 s4, s8
4402; GFX7-NEXT:    s_mov_b32 s5, s9
4403; GFX7-NEXT:    s_mov_b32 s9, s10
4404; GFX7-NEXT:    s_mov_b32 s8, s11
4405; GFX7-NEXT:    s_add_u32 s4, s4, s9
4406; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4407; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4408; GFX7-NEXT:    s_mov_b32 s5, s8
4409; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4410; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4411; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4412; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4413; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4414; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4415; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4416; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4417; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4418; GFX7-NEXT:    s_endpgm
4419;
4420; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
4421; GFX10-WGP:       ; %bb.0: ; %entry
4422; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4423; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4424; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4425; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4426; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4427; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4428; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4429; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4430; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4431; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4432; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4433; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4434; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4435; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4436; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4437; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4438; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4439; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4440; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4441; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4442; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4443; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4444; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4445; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4446; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4447; GFX10-WGP-NEXT:    buffer_gl0_inv
4448; GFX10-WGP-NEXT:    s_endpgm
4449;
4450; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
4451; GFX10-CU:       ; %bb.0: ; %entry
4452; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4453; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4454; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4455; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4456; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4457; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4458; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4459; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4460; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4461; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4462; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4463; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4464; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4465; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4466; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4467; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4468; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4469; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4470; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4471; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4472; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4473; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4474; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4475; GFX10-CU-NEXT:    s_endpgm
4476;
4477; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
4478; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4479; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4480; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4481; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4482; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4483; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4484; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4485; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4486; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4487; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4488; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4489; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4490; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4491; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4492; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4493; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4494; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4495; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4497; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4498; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4499; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4500; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4501; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4502; SKIP-CACHE-INV-NEXT:    s_endpgm
4503;
4504; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4505; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4506; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4507; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4508; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4509; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4510; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4511; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4512; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4513; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4514; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4515; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4516; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4517; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4518; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4519;
4520; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4521; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4522; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4523; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4524; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4525; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4526; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4527; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4528; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4529; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4530; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4531; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4532; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4533; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4534; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4535; GFX90A-TGSPLIT-NEXT:    s_endpgm
4536;
4537; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4538; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4539; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4540; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4541; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4542; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4543; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4544; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4545; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4546; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4547; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4548; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4549; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4550; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4551; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4552;
4553; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4554; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4555; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4556; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4557; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4558; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4559; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4560; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4561; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4562; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4563; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4564; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4565; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4566; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4567; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
4568; GFX940-TGSPLIT-NEXT:    s_endpgm
4569;
4570; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
4571; GFX11-WGP:       ; %bb.0: ; %entry
4572; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4573; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4574; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4575; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4576; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4577; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4578; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4579; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4580; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4581; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4582; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4583; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4584; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4585; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4586; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4587; GFX11-WGP-NEXT:    buffer_gl0_inv
4588; GFX11-WGP-NEXT:    s_endpgm
4589;
4590; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
4591; GFX11-CU:       ; %bb.0: ; %entry
4592; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4593; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4594; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4595; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4596; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4597; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4598; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4599; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4600; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4601; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4602; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4603; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4604; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4605; GFX11-CU-NEXT:    s_endpgm
4606;
4607; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
4608; GFX12-WGP:       ; %bb.0: ; %entry
4609; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4610; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4611; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4612; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4613; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4614; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4615; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4616; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4617; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4618; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4619; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
4620; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
4621; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
4622; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
4623; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4624; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4625; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
4626; GFX12-WGP-NEXT:    s_endpgm
4627;
4628; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
4629; GFX12-CU:       ; %bb.0: ; %entry
4630; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4631; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4632; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4633; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4634; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4635; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4636; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4637; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4638; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4639; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4640; GFX12-CU-NEXT:    s_wait_dscnt 0x0
4641; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4642; GFX12-CU-NEXT:    s_wait_dscnt 0x0
4643; GFX12-CU-NEXT:    s_endpgm
4644    ptr %out, i32 %in, i32 %old) {
4645entry:
  ; Address of i32 element 4 of %out; this is the 16-byte offset that shows up
  ; in the checks either as "offset:16" or as an explicit 64-bit add of 16.
4646  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the memory value with %old and store %in on a match. Success
  ; ordering is release, failure ordering is acquire, scope is workgroup.
4647  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
4648  ret void
4649}
4650
; Test case: workgroup-scope cmpxchg through a plain `ptr`, with acq_rel
; ordering on success and acquire ordering on failure.
;
; The kernel receives a base pointer (%out), the replacement value (%in) and
; the comparand (%old). It performs one volatile compare-exchange on the i32
; located 16 bytes past %out and discards the result.
;
; The check lines that sit between the `define` line and the parameter list
; are generated (see the NOTE on the first line of this file). Regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output below shows, per RUN configuration:
;  * Every configuration emits a flat cmpswap bracketed by wait instructions.
;  * gfx700 (both triples) and gfx1010 form the address with an explicit
;    s_add_u32/s_addc_u32 of 16; the other targets encode it as offset:16.
;  * A cache invalidate follows the atomic only for gfx1010/gfx1100/gfx1200
;    without +cumode (buffer_gl0_inv, global_inv scope:SCOPE_SE) and for
;    gfx90a/gfx940 with +tgsplit (buffer_wbinvl1_vol, buffer_inv sc0).
;  * The remaining configurations expect only the waits around the atomic.
4651define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
4652; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4653; GFX7:       ; %bb.0: ; %entry
4654; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4655; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4656; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4657; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4658; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4659; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4660; GFX7-NEXT:    s_mov_b32 s4, s8
4661; GFX7-NEXT:    s_mov_b32 s5, s9
4662; GFX7-NEXT:    s_mov_b32 s9, s10
4663; GFX7-NEXT:    s_mov_b32 s8, s11
4664; GFX7-NEXT:    s_add_u32 s4, s4, s9
4665; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4666; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4667; GFX7-NEXT:    s_mov_b32 s5, s8
4668; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4669; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4670; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4671; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4672; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4673; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4674; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4675; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4676; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4677; GFX7-NEXT:    s_endpgm
4678;
4679; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4680; GFX10-WGP:       ; %bb.0: ; %entry
4681; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4682; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4683; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4684; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4685; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4686; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4687; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4688; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4689; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4690; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4691; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4692; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4693; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4694; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4695; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4696; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4697; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4698; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4699; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4700; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4701; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4702; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4703; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4704; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4705; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4706; GFX10-WGP-NEXT:    buffer_gl0_inv
4707; GFX10-WGP-NEXT:    s_endpgm
4708;
4709; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4710; GFX10-CU:       ; %bb.0: ; %entry
4711; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4712; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4713; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4714; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4715; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4716; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4717; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4718; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4719; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4720; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4721; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4722; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4723; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4724; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4725; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4726; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4727; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4728; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4729; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4730; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4731; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4732; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4733; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4734; GFX10-CU-NEXT:    s_endpgm
4735;
4736; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4737; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4738; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4739; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4740; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4741; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4742; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4743; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4744; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4745; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4746; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4747; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4748; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4749; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4750; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4751; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4752; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4754; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4756; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4757; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4758; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4759; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4760; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4761; SKIP-CACHE-INV-NEXT:    s_endpgm
4762;
4763; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4764; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4765; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4766; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4767; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4768; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4769; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4770; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4771; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4772; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4773; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4774; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4775; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4776; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4777; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4778;
4779; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4780; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4781; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4782; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4783; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4784; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4785; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4786; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4787; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4788; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4789; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4790; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4791; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4792; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4793; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4794; GFX90A-TGSPLIT-NEXT:    s_endpgm
4795;
4796; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4797; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4798; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4799; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4800; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4801; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4802; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4803; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4804; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4805; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4806; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4807; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4808; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4809; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4810; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4811;
4812; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4813; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4814; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4815; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4816; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4817; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4819; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4820; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4821; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4822; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4823; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4824; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4825; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4826; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
4827; GFX940-TGSPLIT-NEXT:    s_endpgm
4828;
4829; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4830; GFX11-WGP:       ; %bb.0: ; %entry
4831; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4832; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4833; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4834; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4835; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4836; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4837; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4838; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4839; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4840; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4841; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4842; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4843; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4844; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4845; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4846; GFX11-WGP-NEXT:    buffer_gl0_inv
4847; GFX11-WGP-NEXT:    s_endpgm
4848;
4849; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4850; GFX11-CU:       ; %bb.0: ; %entry
4851; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4852; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4853; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4854; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4855; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4856; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4857; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4858; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4859; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4860; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4861; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4862; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4863; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4864; GFX11-CU-NEXT:    s_endpgm
4865;
4866; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4867; GFX12-WGP:       ; %bb.0: ; %entry
4868; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4869; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4870; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4871; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4872; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4873; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4874; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4875; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4876; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4877; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4878; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
4879; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
4880; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
4881; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
4882; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4883; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4884; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
4885; GFX12-WGP-NEXT:    s_endpgm
4886;
4887; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4888; GFX12-CU:       ; %bb.0: ; %entry
4889; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4890; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4891; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4892; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4893; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4894; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4895; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4896; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4897; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4898; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4899; GFX12-CU-NEXT:    s_wait_dscnt 0x0
4900; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4901; GFX12-CU-NEXT:    s_wait_dscnt 0x0
4902; GFX12-CU-NEXT:    s_endpgm
4903    ptr %out, i32 %in, i32 %old) {
4904entry:
  ; Index 4 in units of i32, i.e. a 16-byte displacement from %out. This is
  ; the constant 16 that appears in the expected code above.
4905  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the memory word against %old and store %in on a match.
  ; Success ordering is acq_rel, failure ordering is acquire, and the scope
  ; is "workgroup". %val is never read; only the emitted sequence is tested.
4906  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
4907  ret void
4908}
4909
; Test case: workgroup-scope cmpxchg through a plain `ptr`, with seq_cst
; ordering on success and acquire ordering on failure.
;
; The kernel receives a base pointer (%out), the replacement value (%in) and
; the comparand (%old). It performs one volatile compare-exchange on the i32
; located 16 bytes past %out and discards the result.
;
; The check lines that sit between the `define` line and the parameter list
; are generated (see the NOTE on the first line of this file). Regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output below shows, per RUN configuration:
;  * Every configuration emits a flat cmpswap bracketed by wait instructions.
;  * gfx700 (both triples) and gfx1010 form the address with an explicit
;    s_add_u32/s_addc_u32 of 16; the other targets encode it as offset:16.
;  * A cache invalidate follows the atomic only for gfx1010/gfx1100/gfx1200
;    without +cumode (buffer_gl0_inv, global_inv scope:SCOPE_SE) and for
;    gfx90a/gfx940 with +tgsplit (buffer_wbinvl1_vol, buffer_inv sc0).
;  * The remaining configurations expect only the waits around the atomic.
4910define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
4911; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4912; GFX7:       ; %bb.0: ; %entry
4913; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4914; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4915; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4916; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4917; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4918; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4919; GFX7-NEXT:    s_mov_b32 s4, s8
4920; GFX7-NEXT:    s_mov_b32 s5, s9
4921; GFX7-NEXT:    s_mov_b32 s9, s10
4922; GFX7-NEXT:    s_mov_b32 s8, s11
4923; GFX7-NEXT:    s_add_u32 s4, s4, s9
4924; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4925; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4926; GFX7-NEXT:    s_mov_b32 s5, s8
4927; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4928; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4929; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4930; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4931; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4932; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4933; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4934; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4935; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4936; GFX7-NEXT:    s_endpgm
4937;
4938; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4939; GFX10-WGP:       ; %bb.0: ; %entry
4940; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4941; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4942; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4943; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4944; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4945; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4946; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4947; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4948; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4949; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4950; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4951; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4952; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4953; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4954; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4955; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4956; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4957; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4958; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4959; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4960; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4961; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4962; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4963; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4964; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4965; GFX10-WGP-NEXT:    buffer_gl0_inv
4966; GFX10-WGP-NEXT:    s_endpgm
4967;
4968; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4969; GFX10-CU:       ; %bb.0: ; %entry
4970; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4971; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4972; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4973; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4974; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4975; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4976; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4977; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4978; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4979; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4980; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4981; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4982; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4983; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4984; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4985; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4986; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4987; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4988; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4989; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4990; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4991; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4992; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4993; GFX10-CU-NEXT:    s_endpgm
4994;
4995; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4996; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4997; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4998; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4999; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5000; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5001; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5002; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5003; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5004; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5005; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5006; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5007; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5008; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5009; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5010; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5011; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5012; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5013; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5014; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5015; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5016; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5017; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5018; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5019; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5020; SKIP-CACHE-INV-NEXT:    s_endpgm
5021;
5022; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5023; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5024; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5025; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5026; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5027; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5028; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5029; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5030; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5031; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5032; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5033; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5034; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5035; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5036; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5037;
5038; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5039; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5040; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5041; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5042; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5043; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5044; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5045; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5046; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5047; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5048; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5049; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5050; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5051; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5052; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5053; GFX90A-TGSPLIT-NEXT:    s_endpgm
5054;
5055; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5056; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5057; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5058; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5059; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5060; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5061; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5062; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5063; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5064; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5065; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5066; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5067; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5068; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5069; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5070;
5071; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5072; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5073; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5074; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5075; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5076; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5077; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5078; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5079; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5080; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5081; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5082; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5083; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5084; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5085; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
5086; GFX940-TGSPLIT-NEXT:    s_endpgm
5087;
5088; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5089; GFX11-WGP:       ; %bb.0: ; %entry
5090; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5091; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5092; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5093; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5094; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5095; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5096; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5097; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5098; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5099; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5100; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5101; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5102; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5103; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5104; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5105; GFX11-WGP-NEXT:    buffer_gl0_inv
5106; GFX11-WGP-NEXT:    s_endpgm
5107;
5108; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5109; GFX11-CU:       ; %bb.0: ; %entry
5110; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5111; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5112; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5113; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5114; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5115; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5116; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5117; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5118; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5119; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5120; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5121; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5122; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5123; GFX11-CU-NEXT:    s_endpgm
5124;
5125; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5126; GFX12-WGP:       ; %bb.0: ; %entry
5127; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5128; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5129; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5130; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5131; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5132; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5133; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5134; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5135; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5136; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5137; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
5138; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
5139; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
5140; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5141; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
5142; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
5143; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
5144; GFX12-WGP-NEXT:    s_endpgm
5145;
5146; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5147; GFX12-CU:       ; %bb.0: ; %entry
5148; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5149; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5150; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5151; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5152; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5153; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5154; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5155; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5156; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5157; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5158; GFX12-CU-NEXT:    s_wait_dscnt 0x0
5159; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5160; GFX12-CU-NEXT:    s_wait_dscnt 0x0
5161; GFX12-CU-NEXT:    s_endpgm
5162    ptr %out, i32 %in, i32 %old) {
5163entry:
  ; Index 4 in units of i32, i.e. a 16-byte displacement from %out. This is
  ; the constant 16 that appears in the expected code above.
5164  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the memory word against %old and store %in on a match.
  ; Success ordering is seq_cst, failure ordering is acquire, and the scope
  ; is "workgroup". %val is never read; only the emitted sequence is tested.
5165  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
5166  ret void
5167}
5168
5169define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
5170; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5171; GFX7:       ; %bb.0: ; %entry
5172; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5173; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5174; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5175; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5176; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5177; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5178; GFX7-NEXT:    s_mov_b32 s4, s8
5179; GFX7-NEXT:    s_mov_b32 s5, s9
5180; GFX7-NEXT:    s_mov_b32 s9, s10
5181; GFX7-NEXT:    s_mov_b32 s8, s11
5182; GFX7-NEXT:    s_add_u32 s4, s4, s9
5183; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5184; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5185; GFX7-NEXT:    s_mov_b32 s5, s8
5186; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5187; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5188; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5189; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5190; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5191; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5192; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5193; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5194; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5195; GFX7-NEXT:    s_endpgm
5196;
5197; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5198; GFX10-WGP:       ; %bb.0: ; %entry
5199; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5200; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5201; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5202; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5203; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5204; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5205; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5206; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5207; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5208; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5209; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5210; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5211; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5212; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5213; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5214; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5215; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5216; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5217; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5218; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5219; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5220; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5221; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5222; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5223; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5224; GFX10-WGP-NEXT:    buffer_gl0_inv
5225; GFX10-WGP-NEXT:    s_endpgm
5226;
5227; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5228; GFX10-CU:       ; %bb.0: ; %entry
5229; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5230; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5231; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5232; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5233; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5234; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5235; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5236; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5237; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5238; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5239; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5240; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5241; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5242; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5243; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5244; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5245; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5246; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5247; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5248; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5249; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5250; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5251; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5252; GFX10-CU-NEXT:    s_endpgm
5253;
5254; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5255; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5256; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5257; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5258; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5259; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5260; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5261; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5262; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5263; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5264; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5265; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5266; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5267; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5268; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5269; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5270; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5271; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5272; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5273; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5274; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5275; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5276; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5277; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5278; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5279; SKIP-CACHE-INV-NEXT:    s_endpgm
5280;
5281; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5282; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5283; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5284; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5285; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5286; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5287; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5288; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5289; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5290; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5291; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5292; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5293; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5294; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5295; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5296;
5297; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5298; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5299; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5300; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5301; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5302; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5303; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5304; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5305; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5306; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5307; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5308; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5309; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5310; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5311; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5312; GFX90A-TGSPLIT-NEXT:    s_endpgm
5313;
5314; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5315; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5316; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5317; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5318; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5319; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5320; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5321; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5322; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5323; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5324; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5325; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5326; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5327; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5328; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5329;
5330; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5331; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5332; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5333; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5334; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5335; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5336; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5337; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5338; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5339; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5340; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5341; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5342; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5343; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5344; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
5345; GFX940-TGSPLIT-NEXT:    s_endpgm
5346;
5347; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5348; GFX11-WGP:       ; %bb.0: ; %entry
5349; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5350; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5351; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5352; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5353; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5354; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5355; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5356; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5357; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5358; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5359; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5360; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5361; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5362; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5363; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5364; GFX11-WGP-NEXT:    buffer_gl0_inv
5365; GFX11-WGP-NEXT:    s_endpgm
5366;
5367; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5368; GFX11-CU:       ; %bb.0: ; %entry
5369; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5370; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5371; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5372; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5373; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5374; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5375; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5376; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5377; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5378; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5379; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5380; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5381; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5382; GFX11-CU-NEXT:    s_endpgm
5383;
5384; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5385; GFX12-WGP:       ; %bb.0: ; %entry
5386; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5387; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5388; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5389; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5390; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5391; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5392; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5393; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5394; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5395; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5396; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
5397; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
5398; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
5399; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5400; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
5401; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
5402; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
5403; GFX12-WGP-NEXT:    s_endpgm
5404;
5405; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5406; GFX12-CU:       ; %bb.0: ; %entry
5407; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5408; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5409; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5410; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5411; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5412; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5413; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5414; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5415; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5416; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5417; GFX12-CU-NEXT:    s_wait_dscnt 0x0
5418; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5419; GFX12-CU-NEXT:    s_wait_dscnt 0x0
5420; GFX12-CU-NEXT:    s_endpgm
5421    ptr %out, i32 %in, i32 %old) {
5422entry:
5423  %gep = getelementptr i32, ptr %out, i32 4
5424  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
5425  ret void
5426}
5427
; Test kernel: a volatile cmpxchg on a flat (generic) pointer at workgroup
; sync scope, with monotonic ordering on both the success and failure paths,
; whose returned value is consumed (the "ret" variant) by storing it to %out.
;
; Arguments:
;   %out - base pointer; the cmpxchg targets the i32 element at index 4, and
;          the loaded value is written back to element 0.
;   %in  - replacement value written when the comparison succeeds.
;   %old - expected value compared against memory.
;
; The per-target assertion lines between the two halves of the signature are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
5428define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
5429; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5430; GFX7:       ; %bb.0: ; %entry
5431; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
5432; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5433; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
5434; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
5435; GFX7-NEXT:    s_mov_b64 s[12:13], 16
5436; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5437; GFX7-NEXT:    s_mov_b32 s6, s4
5438; GFX7-NEXT:    s_mov_b32 s7, s5
5439; GFX7-NEXT:    s_mov_b32 s11, s12
5440; GFX7-NEXT:    s_mov_b32 s10, s13
5441; GFX7-NEXT:    s_add_u32 s6, s6, s11
5442; GFX7-NEXT:    s_addc_u32 s10, s7, s10
5443; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5444; GFX7-NEXT:    s_mov_b32 s7, s10
5445; GFX7-NEXT:    v_mov_b32_e32 v2, s9
5446; GFX7-NEXT:    v_mov_b32_e32 v0, s8
5447; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5448; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5449; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5450; GFX7-NEXT:    v_mov_b32_e32 v1, s7
5451; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5452; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5453; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5454; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5455; GFX7-NEXT:    flat_store_dword v[0:1], v2
5456; GFX7-NEXT:    s_endpgm
5457;
5458; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5459; GFX10-WGP:       ; %bb.0: ; %entry
5460; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
5461; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5462; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
5463; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
5464; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
5465; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5466; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
5467; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
5468; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
5469; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
5470; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
5471; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
5472; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5473; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
5474; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
5475; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
5476; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5477; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5478; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5479; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5480; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5481; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5482; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5483; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5484; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5485; GFX10-WGP-NEXT:    s_endpgm
5486;
5487; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5488; GFX10-CU:       ; %bb.0: ; %entry
5489; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
5490; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5491; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
5492; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
5493; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
5494; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5495; GFX10-CU-NEXT:    s_mov_b32 s6, s4
5496; GFX10-CU-NEXT:    s_mov_b32 s7, s5
5497; GFX10-CU-NEXT:    s_mov_b32 s11, s12
5498; GFX10-CU-NEXT:    s_mov_b32 s10, s13
5499; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
5500; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
5501; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5502; GFX10-CU-NEXT:    s_mov_b32 s7, s10
5503; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
5504; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
5505; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5506; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5507; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5508; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5509; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5510; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5511; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5512; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5513; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5514; GFX10-CU-NEXT:    s_endpgm
5515;
5516; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5517; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5518; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5519; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5520; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5521; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5522; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
5523; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5524; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
5525; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
5526; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
5527; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
5528; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
5529; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
5530; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
5531; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5532; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
5533; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5534; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5536; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5537; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5538; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5541; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5542; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5543; SKIP-CACHE-INV-NEXT:    s_endpgm
5544;
5545; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5546; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5547; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5548; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5549; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5550; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5551; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5552; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5553; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5554; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5555; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5556; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5557; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5558; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5559; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5560; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5561;
5562; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5563; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5564; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5565; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5566; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5567; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5568; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5569; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5570; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5571; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5572; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5573; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5574; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5575; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5576; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5577; GFX90A-TGSPLIT-NEXT:    s_endpgm
5578;
5579; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5580; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5581; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5582; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5583; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5584; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5585; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5586; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5587; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5588; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5589; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5590; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5591; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5592; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5593; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
5594; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5595;
5596; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5597; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5598; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5599; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5600; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5601; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5602; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5603; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5604; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5605; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5606; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5607; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5608; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5609; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5610; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
5611; GFX940-TGSPLIT-NEXT:    s_endpgm
5612;
5613; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5614; GFX11-WGP:       ; %bb.0: ; %entry
5615; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5616; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5617; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5618; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5619; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5620; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5621; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5622; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5623; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5624; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5625; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5626; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5627; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5628; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5629; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5630; GFX11-WGP-NEXT:    s_endpgm
5631;
5632; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5633; GFX11-CU:       ; %bb.0: ; %entry
5634; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5635; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5636; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5637; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5638; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5639; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5640; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5641; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5642; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5643; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5644; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5645; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5646; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5647; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5648; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5649; GFX11-CU-NEXT:    s_endpgm
5650;
5651; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5652; GFX12-WGP:       ; %bb.0: ; %entry
5653; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5654; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5655; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5656; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5657; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5658; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5659; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5660; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5661; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5662; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5663; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
5664; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5665; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5666; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5667; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
5668; GFX12-WGP-NEXT:    s_endpgm
5669;
5670; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5671; GFX12-CU:       ; %bb.0: ; %entry
5672; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5673; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5674; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5675; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5676; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5677; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5678; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5679; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5680; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5681; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5682; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
5683; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5684; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5685; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
5686; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
5687; GFX12-CU-NEXT:    s_endpgm
5688    ptr %out, i32 %in, i32 %old) {
5689entry:
  ; Address of i32 element 4 past %out; the expected output above shows this
  ; either as an explicit 64-bit add of 16 or as an "offset:16" operand.
5690  %gep = getelementptr i32, ptr %out, i32 4
  ; The operation under test: compare memory against %old, write %in on match.
5691  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
  ; Use the i32 member (index 0) of the { i32, i1 } result so that the
  ; returning form of the atomic has to be selected.
5692  %val0 = extractvalue { i32, i1 } %val, 0
5693  store i32 %val0, ptr %out, align 4
5694  ret void
5695}
5696
5696
5697define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
5698; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5699; GFX7:       ; %bb.0: ; %entry
5700; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
5701; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5702; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
5703; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
5704; GFX7-NEXT:    s_mov_b64 s[12:13], 16
5705; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5706; GFX7-NEXT:    s_mov_b32 s6, s4
5707; GFX7-NEXT:    s_mov_b32 s7, s5
5708; GFX7-NEXT:    s_mov_b32 s11, s12
5709; GFX7-NEXT:    s_mov_b32 s10, s13
5710; GFX7-NEXT:    s_add_u32 s6, s6, s11
5711; GFX7-NEXT:    s_addc_u32 s10, s7, s10
5712; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5713; GFX7-NEXT:    s_mov_b32 s7, s10
5714; GFX7-NEXT:    v_mov_b32_e32 v2, s9
5715; GFX7-NEXT:    v_mov_b32_e32 v0, s8
5716; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5717; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5718; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5719; GFX7-NEXT:    v_mov_b32_e32 v1, s7
5720; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5721; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5722; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5723; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5724; GFX7-NEXT:    s_waitcnt vmcnt(0)
5725; GFX7-NEXT:    flat_store_dword v[0:1], v2
5726; GFX7-NEXT:    s_endpgm
5727;
5728; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5729; GFX10-WGP:       ; %bb.0: ; %entry
5730; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
5731; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5732; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
5733; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
5734; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
5735; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5736; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
5737; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
5738; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
5739; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
5740; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
5741; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
5742; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5743; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
5744; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
5745; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
5746; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5747; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5748; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5749; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5750; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5751; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5752; GFX10-WGP-NEXT:    buffer_gl0_inv
5753; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5754; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5755; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5756; GFX10-WGP-NEXT:    s_endpgm
5757;
5758; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5759; GFX10-CU:       ; %bb.0: ; %entry
5760; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
5761; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5762; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
5763; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
5764; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
5765; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5766; GFX10-CU-NEXT:    s_mov_b32 s6, s4
5767; GFX10-CU-NEXT:    s_mov_b32 s7, s5
5768; GFX10-CU-NEXT:    s_mov_b32 s11, s12
5769; GFX10-CU-NEXT:    s_mov_b32 s10, s13
5770; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
5771; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
5772; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5773; GFX10-CU-NEXT:    s_mov_b32 s7, s10
5774; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
5775; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
5776; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5777; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5778; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5779; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5780; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5781; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5782; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5783; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5784; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5785; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5786; GFX10-CU-NEXT:    s_endpgm
5787;
5788; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5789; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5790; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5791; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5792; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5793; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5794; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
5795; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5796; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
5797; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
5798; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
5799; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
5800; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
5801; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
5802; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
5803; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
5805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5806; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5809; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5810; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5811; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5812; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5813; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5814; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5815; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5816; SKIP-CACHE-INV-NEXT:    s_endpgm
5817;
5818; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5819; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5820; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5821; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5822; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5823; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5824; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5825; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5826; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5827; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5828; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5829; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5830; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5831; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5832; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5833; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5834; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5835;
5836; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5837; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5838; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5839; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5840; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5841; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5842; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5843; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5844; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5845; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5846; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5847; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5848; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5849; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5850; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5851; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5852; GFX90A-TGSPLIT-NEXT:    s_endpgm
5853;
5854; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5855; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5856; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5857; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5858; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5859; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5860; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5861; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5862; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5863; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5864; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5865; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5866; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5867; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5868; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5869; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
5870; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5871;
5872; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5873; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5874; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5875; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5876; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5877; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5878; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5879; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5880; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5881; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5882; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5883; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5884; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5885; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
5886; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5887; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
5888; GFX940-TGSPLIT-NEXT:    s_endpgm
5889;
5890; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5891; GFX11-WGP:       ; %bb.0: ; %entry
5892; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5893; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5894; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5895; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5896; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5897; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5898; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5899; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5900; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5901; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5902; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5903; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5904; GFX11-WGP-NEXT:    buffer_gl0_inv
5905; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5906; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5907; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5908; GFX11-WGP-NEXT:    s_endpgm
5909;
5910; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5911; GFX11-CU:       ; %bb.0: ; %entry
5912; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5913; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5914; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5915; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5916; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5917; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5918; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5919; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5920; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5921; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5922; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5923; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5924; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5925; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5926; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
5927; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5928; GFX11-CU-NEXT:    s_endpgm
5929;
5930; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5931; GFX12-WGP:       ; %bb.0: ; %entry
5932; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5933; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5934; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5935; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5936; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5937; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5938; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5939; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5940; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5941; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5942; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
5943; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5944; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
5945; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5946; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5947; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
5948; GFX12-WGP-NEXT:    s_endpgm
5949;
5950; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5951; GFX12-CU:       ; %bb.0: ; %entry
5952; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5953; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5954; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5955; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5956; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5957; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5958; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5959; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5960; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5961; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5962; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
5963; GFX12-CU-NEXT:    s_wait_dscnt 0x0
5964; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5965; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5966; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
5967; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
5968; GFX12-CU-NEXT:    s_endpgm
5969    ptr %out, i32 %in, i32 %old) {
5970entry:
5971  %gep = getelementptr i32, ptr %out, i32 4
5972  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
5973  %val0 = extractvalue { i32, i1 } %val, 0
5974  store i32 %val0, ptr %out, align 4
5975  ret void
5976}
5977
; Code generation check for a volatile cmpxchg through a generic `ptr` with
; syncscope("workgroup"), success ordering `release` and failure ordering
; `monotonic`, whose result is stored back to %out. Every RUN configuration
; listed at the top of the file has its own block of expected instructions
; below. Those blocks are autogenerated (see the NOTE on the first line of the
; file): regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
5978define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
5979; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
5980; GFX7:       ; %bb.0: ; %entry
5981; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
5982; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5983; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
5984; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
5985; GFX7-NEXT:    s_mov_b64 s[12:13], 16
5986; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5987; GFX7-NEXT:    s_mov_b32 s6, s4
5988; GFX7-NEXT:    s_mov_b32 s7, s5
5989; GFX7-NEXT:    s_mov_b32 s11, s12
5990; GFX7-NEXT:    s_mov_b32 s10, s13
5991; GFX7-NEXT:    s_add_u32 s6, s6, s11
5992; GFX7-NEXT:    s_addc_u32 s10, s7, s10
5993; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5994; GFX7-NEXT:    s_mov_b32 s7, s10
5995; GFX7-NEXT:    v_mov_b32_e32 v2, s9
5996; GFX7-NEXT:    v_mov_b32_e32 v0, s8
5997; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5998; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5999; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6000; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6001; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6002; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6003; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6004; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6005; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6006; GFX7-NEXT:    flat_store_dword v[0:1], v2
6007; GFX7-NEXT:    s_endpgm
6008;
6009; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6010; GFX10-WGP:       ; %bb.0: ; %entry
6011; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6012; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6013; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6014; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6015; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6016; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6017; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6018; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6019; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6020; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6021; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6022; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6023; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6024; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6025; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6026; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6027; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6028; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6029; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6030; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6031; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6032; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6033; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6034; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6035; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6036; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6037; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6038; GFX10-WGP-NEXT:    s_endpgm
6039;
6040; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6041; GFX10-CU:       ; %bb.0: ; %entry
6042; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6043; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6044; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6045; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6046; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6047; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6048; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6049; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6050; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6051; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6052; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6053; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6054; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6055; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6056; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6057; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6058; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6059; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6060; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6061; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6062; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6063; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6064; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6065; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6066; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6067; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6068; GFX10-CU-NEXT:    s_endpgm
6069;
6070; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6071; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6072; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6073; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6074; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6075; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6076; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6077; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6078; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6079; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6080; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6081; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6082; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6083; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6084; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6085; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6086; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6087; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6088; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6091; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6092; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6093; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6095; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6096; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6097; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6098; SKIP-CACHE-INV-NEXT:    s_endpgm
6099;
6100; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6101; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6102; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6103; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6104; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6105; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6106; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6107; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6108; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6109; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6110; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6111; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6112; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6113; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6114; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6115; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6116; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6117;
6118; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6119; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6120; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6121; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6122; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6123; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6124; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6125; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6126; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6127; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6128; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6129; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6130; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6131; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6132; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6133; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6134; GFX90A-TGSPLIT-NEXT:    s_endpgm
6135;
6136; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6137; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6138; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6139; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6140; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6141; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6142; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6143; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6144; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6145; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6146; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6147; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6148; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6149; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6150; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6151; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6152; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6153;
6154; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6155; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6156; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6157; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6158; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6159; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6160; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6161; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6162; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6163; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6164; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6165; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6166; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6167; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6168; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6169; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6170; GFX940-TGSPLIT-NEXT:    s_endpgm
6171;
6172; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6173; GFX11-WGP:       ; %bb.0: ; %entry
6174; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6175; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6176; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6177; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6178; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6179; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6180; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6181; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6182; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6183; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6184; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6185; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6186; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6187; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6188; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6189; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6190; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6191; GFX11-WGP-NEXT:    s_endpgm
6192;
6193; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6194; GFX11-CU:       ; %bb.0: ; %entry
6195; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6196; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6197; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6198; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6199; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6200; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6201; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6202; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6203; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6204; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6205; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6206; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6207; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6208; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6209; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6210; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6211; GFX11-CU-NEXT:    s_endpgm
6212;
6213; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6214; GFX12-WGP:       ; %bb.0: ; %entry
6215; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6216; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6217; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6218; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6219; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6220; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6221; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6222; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6223; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6224; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6225; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6226; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6227; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6228; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6229; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
6230; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6231; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6232; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6233; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6234; GFX12-WGP-NEXT:    s_endpgm
6235;
6236; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6237; GFX12-CU:       ; %bb.0: ; %entry
6238; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6239; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6240; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6241; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6242; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6243; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6244; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6245; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6246; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6247; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6248; GFX12-CU-NEXT:    s_wait_dscnt 0x0
6249; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6250; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6251; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6252; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6253; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6254; GFX12-CU-NEXT:    s_endpgm
6255    ptr %out, i32 %in, i32 %old) {
6256entry:
  ; Address of the i32 element at index 4 from %out, i.e. 16 bytes past it.
  ; This is the constant 16 / `offset:16` in the expected output above.
6257  %gep = getelementptr i32, ptr %out, i32 4
  ; The operation under test: compare with %old and swap in %in, with release
  ; ordering on success and monotonic ordering on failure, at workgroup scope.
6258  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
  ; Element 0 of the { i32, i1 } result is the value loaded from memory.
6259  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store that value to %out so the atomic's returned result is actually used.
6260  store i32 %val0, ptr %out, align 4
6261  ret void
6262}
6263
6264define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
6265; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6266; GFX7:       ; %bb.0: ; %entry
6267; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6268; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6269; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6270; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6271; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6272; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6273; GFX7-NEXT:    s_mov_b32 s6, s4
6274; GFX7-NEXT:    s_mov_b32 s7, s5
6275; GFX7-NEXT:    s_mov_b32 s11, s12
6276; GFX7-NEXT:    s_mov_b32 s10, s13
6277; GFX7-NEXT:    s_add_u32 s6, s6, s11
6278; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6279; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6280; GFX7-NEXT:    s_mov_b32 s7, s10
6281; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6282; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6283; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6284; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6285; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6286; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6287; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6288; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6289; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6290; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6291; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6292; GFX7-NEXT:    s_waitcnt vmcnt(0)
6293; GFX7-NEXT:    flat_store_dword v[0:1], v2
6294; GFX7-NEXT:    s_endpgm
6295;
6296; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6297; GFX10-WGP:       ; %bb.0: ; %entry
6298; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6299; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6300; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6301; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6302; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6303; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6304; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6305; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6306; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6307; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6308; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6309; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6310; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6311; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6312; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6313; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6314; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6315; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6316; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6317; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6318; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6319; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6320; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6321; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6322; GFX10-WGP-NEXT:    buffer_gl0_inv
6323; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6324; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6325; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6326; GFX10-WGP-NEXT:    s_endpgm
6327;
6328; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6329; GFX10-CU:       ; %bb.0: ; %entry
6330; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6331; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6332; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6333; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6334; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6335; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6336; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6337; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6338; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6339; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6340; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6341; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6342; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6343; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6344; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6345; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6346; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6347; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6348; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6349; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6350; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6351; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6352; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6353; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6354; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6355; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6356; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6357; GFX10-CU-NEXT:    s_endpgm
6358;
6359; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6360; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6361; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6362; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6363; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6364; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6365; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6366; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6368; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6369; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6370; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6371; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6372; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6373; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6374; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6377; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6378; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6379; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6380; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6381; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6382; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6383; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6384; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6385; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6386; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6387; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6388; SKIP-CACHE-INV-NEXT:    s_endpgm
6389;
6390; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6391; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6392; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6393; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6394; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6395; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6396; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6397; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6398; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6399; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6400; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6401; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6402; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6403; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6404; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6405; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6406; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6407; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6408;
6409; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6410; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6411; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6412; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6413; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6414; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6415; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6416; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6417; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6418; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6419; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6420; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6421; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6422; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6423; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6424; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6425; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6426; GFX90A-TGSPLIT-NEXT:    s_endpgm
6427;
6428; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6429; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6430; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6431; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6432; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6433; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6434; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6435; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6436; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6437; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6438; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6439; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6440; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6441; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6442; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6443; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6444; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6445; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6446;
6447; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6448; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6449; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6450; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6451; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6452; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6453; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6454; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6455; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6456; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6457; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6458; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6459; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6460; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6461; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
6462; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6463; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6464; GFX940-TGSPLIT-NEXT:    s_endpgm
6465;
6466; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6467; GFX11-WGP:       ; %bb.0: ; %entry
6468; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6469; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6470; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6471; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6472; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6473; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6474; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6475; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6476; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6477; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6478; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6479; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6480; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6481; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6482; GFX11-WGP-NEXT:    buffer_gl0_inv
6483; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6484; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6485; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6486; GFX11-WGP-NEXT:    s_endpgm
6487;
6488; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6489; GFX11-CU:       ; %bb.0: ; %entry
6490; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6491; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6492; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6493; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6494; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6495; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6496; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6497; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6498; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6499; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6500; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6501; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6502; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6503; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6504; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6505; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
6506; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6507; GFX11-CU-NEXT:    s_endpgm
6508;
6509; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6510; GFX12-WGP:       ; %bb.0: ; %entry
6511; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6512; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6513; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6514; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6515; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6516; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6517; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6518; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6519; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6520; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6521; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6522; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6523; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6524; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6525; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
6526; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6527; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6528; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6529; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
6530; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6531; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6532; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6533; GFX12-WGP-NEXT:    s_endpgm
6534;
6535; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6536; GFX12-CU:       ; %bb.0: ; %entry
6537; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6538; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6539; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6540; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6541; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6542; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6543; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6544; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6545; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6546; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6547; GFX12-CU-NEXT:    s_wait_dscnt 0x0
6548; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6549; GFX12-CU-NEXT:    s_wait_dscnt 0x0
6550; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6551; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6552; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
6553; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6554; GFX12-CU-NEXT:    s_endpgm
6555    ptr %out, i32 %in, i32 %old) {
6556entry:
6557  %gep = getelementptr i32, ptr %out, i32 4
6558  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
6559  %val0 = extractvalue { i32, i1 } %val, 0
6560  store i32 %val0, ptr %out, align 4
6561  ret void
6562}
6563
; Returning cmpxchg through a default-address-space pointer at workgroup scope,
; with success ordering seq_cst and failure ordering monotonic. The checks show
; it selected as a flat_atomic_cmpswap instruction on every run configuration.
; The expected-output comments inside the parameter list are autogenerated by
; utils/update_llc_test_checks.py; regenerate them rather than hand-editing.
; For this ordering every configuration below has a wait instruction directly
; before the atomic and another after it; a cache-invalidate instruction after
; the atomic appears only under the prefixes ending in -WGP and -TGSPLIT.
6564define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
6565; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6566; GFX7:       ; %bb.0: ; %entry
6567; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6568; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6569; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6570; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6571; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6572; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6573; GFX7-NEXT:    s_mov_b32 s6, s4
6574; GFX7-NEXT:    s_mov_b32 s7, s5
6575; GFX7-NEXT:    s_mov_b32 s11, s12
6576; GFX7-NEXT:    s_mov_b32 s10, s13
6577; GFX7-NEXT:    s_add_u32 s6, s6, s11
6578; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6579; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6580; GFX7-NEXT:    s_mov_b32 s7, s10
6581; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6582; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6583; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6584; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6585; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6586; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6587; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6588; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6589; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6590; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6591; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6592; GFX7-NEXT:    s_waitcnt vmcnt(0)
6593; GFX7-NEXT:    flat_store_dword v[0:1], v2
6594; GFX7-NEXT:    s_endpgm
6595;
6596; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6597; GFX10-WGP:       ; %bb.0: ; %entry
6598; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6599; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6600; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6601; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6602; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6603; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6604; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6605; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6606; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6607; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6608; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6609; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6610; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6611; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6612; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6613; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6614; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6615; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6616; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6617; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6618; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6619; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6620; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6621; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6622; GFX10-WGP-NEXT:    buffer_gl0_inv
6623; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6624; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6625; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6626; GFX10-WGP-NEXT:    s_endpgm
6627;
6628; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6629; GFX10-CU:       ; %bb.0: ; %entry
6630; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6631; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6632; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6633; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6634; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6635; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6636; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6637; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6638; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6639; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6640; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6641; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6642; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6643; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6644; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6645; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6646; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6647; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6648; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6649; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6650; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6651; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6652; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6653; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6654; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6655; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6656; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6657; GFX10-CU-NEXT:    s_endpgm
6658;
6659; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6660; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6661; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6662; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6663; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6664; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6665; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6666; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6667; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6668; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6669; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6670; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6671; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6672; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6673; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6674; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6675; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6676; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6677; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6678; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6679; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6681; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6682; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6683; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6684; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6686; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6687; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6688; SKIP-CACHE-INV-NEXT:    s_endpgm
6689;
6690; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6691; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6692; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6693; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6694; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6695; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6696; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6697; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6698; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6699; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6700; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6701; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6702; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6703; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6704; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6705; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6706; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6707; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6708;
6709; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6710; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6711; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6712; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6713; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6714; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6715; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6716; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6717; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6718; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6719; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6720; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6721; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6722; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6723; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6724; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6725; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6726; GFX90A-TGSPLIT-NEXT:    s_endpgm
6727;
6728; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6729; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6730; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6731; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6732; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6733; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6734; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6735; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6736; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6737; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6738; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6739; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6740; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6741; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6742; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6743; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6744; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6745; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6746;
6747; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6748; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6749; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6750; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6751; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6752; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6753; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6754; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6755; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6756; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6757; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6758; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6759; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6760; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6761; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
6762; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6763; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6764; GFX940-TGSPLIT-NEXT:    s_endpgm
6765;
6766; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6767; GFX11-WGP:       ; %bb.0: ; %entry
6768; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6769; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6770; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6771; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6772; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6773; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6774; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6775; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6776; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6777; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6778; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6779; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6780; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6781; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6782; GFX11-WGP-NEXT:    buffer_gl0_inv
6783; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6784; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6785; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6786; GFX11-WGP-NEXT:    s_endpgm
6787;
6788; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6789; GFX11-CU:       ; %bb.0: ; %entry
6790; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6791; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6792; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6793; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6794; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6795; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6796; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6797; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6798; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6799; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6800; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6801; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6802; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6803; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6804; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6805; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
6806; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6807; GFX11-CU-NEXT:    s_endpgm
6808;
6809; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6810; GFX12-WGP:       ; %bb.0: ; %entry
6811; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6812; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6813; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6814; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6815; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6816; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6817; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6818; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6819; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6820; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6821; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6822; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6823; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6824; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6825; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
6826; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6827; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6828; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6829; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
6830; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6831; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6832; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6833; GFX12-WGP-NEXT:    s_endpgm
6834;
6835; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6836; GFX12-CU:       ; %bb.0: ; %entry
6837; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6838; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6839; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6840; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6841; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6842; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6843; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6844; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6845; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6846; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6847; GFX12-CU-NEXT:    s_wait_dscnt 0x0
6848; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6849; GFX12-CU-NEXT:    s_wait_dscnt 0x0
6850; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6851; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6852; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
6853; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6854; GFX12-CU-NEXT:    s_endpgm
6855    ptr %out, i32 %in, i32 %old) {
6856entry:
  ; Address of the i32 element at index 4 from %out, i.e. 16 bytes past it;
  ; the checks show this as "offset:16" or as an explicit 64-bit add of 16.
6857  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap at workgroup scope: %old is the comparand and
  ; %in the replacement; success ordering seq_cst, failure ordering monotonic.
6858  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag in field 1 is not used.
6859  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the returned value keeps the returning form of the atomic live.
6860  store i32 %val0, ptr %out, align 4
6861  ret void
6862}
6863
; Returning cmpxchg through a default-address-space pointer at workgroup scope,
; with success ordering monotonic and failure ordering acquire. The checks show
; it selected as a flat_atomic_cmpswap instruction on every run configuration.
; The expected-output comments inside the parameter list are autogenerated by
; utils/update_llc_test_checks.py; regenerate them rather than hand-editing.
; For this ordering no wait instruction sits directly before the atomic in any
; configuration below; waits appear only after it, and a cache-invalidate
; instruction follows only under the prefixes ending in -WGP and -TGSPLIT.
6864define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
6865; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6866; GFX7:       ; %bb.0: ; %entry
6867; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6868; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6869; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6870; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6871; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6872; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6873; GFX7-NEXT:    s_mov_b32 s6, s4
6874; GFX7-NEXT:    s_mov_b32 s7, s5
6875; GFX7-NEXT:    s_mov_b32 s11, s12
6876; GFX7-NEXT:    s_mov_b32 s10, s13
6877; GFX7-NEXT:    s_add_u32 s6, s6, s11
6878; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6879; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6880; GFX7-NEXT:    s_mov_b32 s7, s10
6881; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6882; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6883; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6884; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6885; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6886; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6887; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6888; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6889; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6890; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6891; GFX7-NEXT:    s_waitcnt vmcnt(0)
6892; GFX7-NEXT:    flat_store_dword v[0:1], v2
6893; GFX7-NEXT:    s_endpgm
6894;
6895; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6896; GFX10-WGP:       ; %bb.0: ; %entry
6897; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6898; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6899; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6900; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6901; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6902; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6903; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6904; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6905; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6906; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6907; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6908; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6909; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6910; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6911; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6912; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6913; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6914; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6915; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6916; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6917; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6918; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6919; GFX10-WGP-NEXT:    buffer_gl0_inv
6920; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6921; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6922; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6923; GFX10-WGP-NEXT:    s_endpgm
6924;
6925; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6926; GFX10-CU:       ; %bb.0: ; %entry
6927; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6928; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6929; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6930; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6931; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6932; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6933; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6934; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6935; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6936; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6937; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6938; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6939; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6940; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6941; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6942; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6943; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6944; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6945; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6946; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6947; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6948; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6949; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6950; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6951; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6952; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6953; GFX10-CU-NEXT:    s_endpgm
6954;
6955; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6956; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6957; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6958; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6959; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6960; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6961; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6962; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6963; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6964; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6965; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6966; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6967; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6968; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6969; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6970; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6972; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6973; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6974; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6975; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6976; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6977; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6978; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6979; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6980; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6981; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6982; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6983; SKIP-CACHE-INV-NEXT:    s_endpgm
6984;
6985; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6986; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6987; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6988; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6989; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6990; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6991; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6992; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6993; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6994; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6995; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6996; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6997; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6998; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6999; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7000; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7001; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7002;
7003; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7004; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7005; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7006; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7007; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7008; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7009; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7010; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7011; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7012; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7013; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7014; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7015; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7016; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7017; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7018; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7019; GFX90A-TGSPLIT-NEXT:    s_endpgm
7020;
7021; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7022; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7023; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7024; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7025; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7026; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7027; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7028; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7029; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7030; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7031; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7032; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7033; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7034; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7035; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7036; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7037; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7038;
7039; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7040; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7041; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7042; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7043; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7044; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7045; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7046; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7047; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7048; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7049; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7050; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7051; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7052; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
7053; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7054; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7055; GFX940-TGSPLIT-NEXT:    s_endpgm
7056;
7057; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7058; GFX11-WGP:       ; %bb.0: ; %entry
7059; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7060; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7061; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7062; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7063; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7064; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7065; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7066; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7067; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7068; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7069; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7070; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7071; GFX11-WGP-NEXT:    buffer_gl0_inv
7072; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7073; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7074; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7075; GFX11-WGP-NEXT:    s_endpgm
7076;
7077; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7078; GFX11-CU:       ; %bb.0: ; %entry
7079; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7080; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7081; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7082; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7083; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7084; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7085; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7086; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7087; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7088; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7089; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7090; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7091; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7092; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7093; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7094; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7095; GFX11-CU-NEXT:    s_endpgm
7096;
7097; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7098; GFX12-WGP:       ; %bb.0: ; %entry
7099; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7100; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7101; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7102; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7103; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7104; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7105; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7106; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7107; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7108; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7109; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7110; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
7111; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
7112; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7113; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
7114; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7115; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7116; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7117; GFX12-WGP-NEXT:    s_endpgm
7118;
7119; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7120; GFX12-CU:       ; %bb.0: ; %entry
7121; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7122; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7123; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7124; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7125; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7126; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7127; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7128; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7129; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7130; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7131; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7132; GFX12-CU-NEXT:    s_wait_dscnt 0x0
7133; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7134; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7135; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7136; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7137; GFX12-CU-NEXT:    s_endpgm
7138    ptr %out, i32 %in, i32 %old) {
7139entry:
  ; Address of the i32 element at index 4 from %out, i.e. 16 bytes past it;
  ; the checks show this as "offset:16" or as an explicit 64-bit add of 16.
7140  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap at workgroup scope: %old is the comparand and
  ; %in the replacement; success ordering monotonic, failure ordering acquire.
7141  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag in field 1 is not used.
7142  %val0 = extractvalue { i32, i1 } %val, 0
  ; Using the returned value keeps the returning form of the atomic live.
7143  store i32 %val0, ptr %out, align 4
7144  ret void
7145}
7146
; Returning cmpxchg at syncscope("workgroup") on a flat pointer, with acquire
; success ordering and acquire failure ordering. The exchange targets
; %out + 4 x i32 (the 16-byte add / offset:16 in the checks below), and the
; loaded old value (field 0 of the result pair) is stored back through %out.
; In these checks the waits and any cache invalidate come only after the
; cmpswap; no wait is placed between the address setup and the atomic itself.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
7147define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
7148; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7149; GFX7:       ; %bb.0: ; %entry
7150; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7151; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7152; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7153; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7154; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7155; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7156; GFX7-NEXT:    s_mov_b32 s6, s4
7157; GFX7-NEXT:    s_mov_b32 s7, s5
7158; GFX7-NEXT:    s_mov_b32 s11, s12
7159; GFX7-NEXT:    s_mov_b32 s10, s13
7160; GFX7-NEXT:    s_add_u32 s6, s6, s11
7161; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7162; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7163; GFX7-NEXT:    s_mov_b32 s7, s10
7164; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7165; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7166; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7167; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7168; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7169; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7170; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7171; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7172; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7173; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7174; GFX7-NEXT:    s_waitcnt vmcnt(0)
7175; GFX7-NEXT:    flat_store_dword v[0:1], v2
7176; GFX7-NEXT:    s_endpgm
7177;
7178; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7179; GFX10-WGP:       ; %bb.0: ; %entry
7180; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7181; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7182; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7183; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7184; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7185; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7186; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7187; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7188; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7189; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7190; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7191; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7192; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7193; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7194; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7195; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7196; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7197; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7198; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7199; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7200; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7201; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7202; GFX10-WGP-NEXT:    buffer_gl0_inv
7203; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7204; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7205; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7206; GFX10-WGP-NEXT:    s_endpgm
7207;
7208; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7209; GFX10-CU:       ; %bb.0: ; %entry
7210; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7211; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7212; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7213; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7214; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7215; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7216; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7217; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7218; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7219; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7220; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7221; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7222; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7223; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7224; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7225; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7226; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7227; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7228; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7229; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7230; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7231; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7232; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7233; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7234; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7235; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7236; GFX10-CU-NEXT:    s_endpgm
7237;
7238; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7239; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7240; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7241; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7242; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7243; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7244; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7245; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7246; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7247; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7248; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7249; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7250; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7251; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7252; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7253; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7254; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7255; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7256; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7257; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7258; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7259; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7260; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7261; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7262; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7263; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7264; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7265; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7266; SKIP-CACHE-INV-NEXT:    s_endpgm
7267;
7268; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7269; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7270; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7271; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7272; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7273; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7274; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7275; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7276; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7277; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7278; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7279; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7280; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7281; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7282; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7283; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7284; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7285;
7286; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7287; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7288; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7289; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7290; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7291; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7292; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7293; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7294; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7295; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7296; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7297; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7298; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7299; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7300; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7301; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7302; GFX90A-TGSPLIT-NEXT:    s_endpgm
7303;
7304; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7305; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7306; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7307; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7308; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7309; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7310; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7311; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7312; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7313; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7314; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7315; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7316; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7317; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7318; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7319; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7320; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7321;
7322; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7323; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7324; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7325; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7326; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7327; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7328; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7329; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7330; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7331; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7332; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7333; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7334; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7335; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
7336; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7337; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7338; GFX940-TGSPLIT-NEXT:    s_endpgm
7339;
7340; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7341; GFX11-WGP:       ; %bb.0: ; %entry
7342; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7343; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7344; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7345; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7346; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7347; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7348; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7349; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7350; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7351; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7352; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7353; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7354; GFX11-WGP-NEXT:    buffer_gl0_inv
7355; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7356; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7357; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7358; GFX11-WGP-NEXT:    s_endpgm
7359;
7360; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7361; GFX11-CU:       ; %bb.0: ; %entry
7362; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7363; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7364; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7365; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7366; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7367; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7368; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7369; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7370; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7371; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7372; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7373; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7374; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7375; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7376; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7377; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7378; GFX11-CU-NEXT:    s_endpgm
7379;
7380; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7381; GFX12-WGP:       ; %bb.0: ; %entry
7382; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7383; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7384; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7385; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7386; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7387; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7388; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7389; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7390; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7391; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7392; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7393; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7394; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
7395; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7396; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7397; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7398; GFX12-WGP-NEXT:    s_endpgm
7399;
7400; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7401; GFX12-CU:       ; %bb.0: ; %entry
7402; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7403; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7404; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7405; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7406; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7407; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7408; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7409; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7410; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7411; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7412; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7413; GFX12-CU-NEXT:    s_wait_dscnt 0x0
7414; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7415; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7416; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7417; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7418; GFX12-CU-NEXT:    s_endpgm
7419    ptr %out, i32 %in, i32 %old) {
7420entry:
7421  %gep = getelementptr i32, ptr %out, i32 4
7422  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
7423  %val0 = extractvalue { i32, i1 } %val, 0
7424  store i32 %val0, ptr %out, align 4
7425  ret void
7426}
7427
; Returning cmpxchg at syncscope("workgroup") on a flat pointer, with release
; success ordering and acquire failure ordering. The exchange targets
; %out + 4 x i32 (the 16-byte add / offset:16 in the checks below), and the
; loaded old value (field 0 of the result pair) is stored back through %out.
; In these checks every target places a wait immediately before the cmpswap
; as well as the waits (and, on some targets, a cache invalidate) after it.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
7428define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
7429; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7430; GFX7:       ; %bb.0: ; %entry
7431; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7432; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7433; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7434; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7435; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7436; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7437; GFX7-NEXT:    s_mov_b32 s6, s4
7438; GFX7-NEXT:    s_mov_b32 s7, s5
7439; GFX7-NEXT:    s_mov_b32 s11, s12
7440; GFX7-NEXT:    s_mov_b32 s10, s13
7441; GFX7-NEXT:    s_add_u32 s6, s6, s11
7442; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7443; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7444; GFX7-NEXT:    s_mov_b32 s7, s10
7445; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7446; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7447; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7448; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7449; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7450; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7451; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7452; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7453; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7454; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7455; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7456; GFX7-NEXT:    s_waitcnt vmcnt(0)
7457; GFX7-NEXT:    flat_store_dword v[0:1], v2
7458; GFX7-NEXT:    s_endpgm
7459;
7460; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7461; GFX10-WGP:       ; %bb.0: ; %entry
7462; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7463; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7464; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7465; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7466; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7467; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7468; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7469; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7470; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7471; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7472; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7473; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7474; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7475; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7476; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7477; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7478; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7479; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7480; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7481; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7482; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7483; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7484; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7485; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7486; GFX10-WGP-NEXT:    buffer_gl0_inv
7487; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7488; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7489; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7490; GFX10-WGP-NEXT:    s_endpgm
7491;
7492; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7493; GFX10-CU:       ; %bb.0: ; %entry
7494; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7495; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7496; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7497; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7498; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7499; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7500; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7501; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7502; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7503; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7504; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7505; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7506; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7507; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7508; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7509; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7510; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7511; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7512; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7513; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7514; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7515; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7516; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7517; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7518; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7519; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7520; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7521; GFX10-CU-NEXT:    s_endpgm
7522;
7523; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7524; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7525; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7526; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7527; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7528; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7529; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7530; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7531; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7532; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7533; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7534; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7535; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7536; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7537; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7541; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7542; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7543; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7544; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7545; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7546; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7547; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7549; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7550; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7551; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7552; SKIP-CACHE-INV-NEXT:    s_endpgm
7553;
7554; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7555; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7556; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7557; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7558; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7559; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7560; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7561; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7562; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7563; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7564; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7565; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7566; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7567; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7568; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7569; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7570; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7571; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7572;
7573; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7574; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7575; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7576; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7577; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7578; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7579; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7580; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7581; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7582; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7583; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7584; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7585; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7586; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7587; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7588; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7589; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7590; GFX90A-TGSPLIT-NEXT:    s_endpgm
7591;
7592; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7593; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7594; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7595; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7596; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7597; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7598; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7599; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7600; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7601; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7602; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7603; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7604; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7605; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7606; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7607; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7608; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7609; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7610;
7611; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7612; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7613; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7614; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7615; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7616; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7617; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7618; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7619; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7620; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7621; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7622; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7623; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7624; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7625; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
7626; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7627; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7628; GFX940-TGSPLIT-NEXT:    s_endpgm
7629;
7630; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7631; GFX11-WGP:       ; %bb.0: ; %entry
7632; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7633; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7634; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7635; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7636; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7637; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7638; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7639; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7640; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7641; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7642; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7643; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7644; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7645; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7646; GFX11-WGP-NEXT:    buffer_gl0_inv
7647; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7648; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7649; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7650; GFX11-WGP-NEXT:    s_endpgm
7651;
7652; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7653; GFX11-CU:       ; %bb.0: ; %entry
7654; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7655; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7656; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7657; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7658; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7659; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7660; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7661; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7662; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7663; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7664; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7665; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7666; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7667; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7668; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7669; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7670; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7671; GFX11-CU-NEXT:    s_endpgm
7672;
7673; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7674; GFX12-WGP:       ; %bb.0: ; %entry
7675; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7676; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7677; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7678; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7679; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7680; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7681; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7682; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7683; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7684; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7685; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
7686; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
7687; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
7688; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7689; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7690; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
7691; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
7692; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7693; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
7694; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7695; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7696; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7697; GFX12-WGP-NEXT:    s_endpgm
7698;
7699; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7700; GFX12-CU:       ; %bb.0: ; %entry
7701; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7702; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7703; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7704; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7705; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7706; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7707; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7708; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7709; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7710; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7711; GFX12-CU-NEXT:    s_wait_dscnt 0x0
7712; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7713; GFX12-CU-NEXT:    s_wait_dscnt 0x0
7714; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7715; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7716; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7717; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7718; GFX12-CU-NEXT:    s_endpgm
7719    ptr %out, i32 %in, i32 %old) {
7720entry:
7721  %gep = getelementptr i32, ptr %out, i32 4
7722  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
7723  %val0 = extractvalue { i32, i1 } %val, 0
7724  store i32 %val0, ptr %out, align 4
7725  ret void
7726}
7727
7728define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
7729; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7730; GFX7:       ; %bb.0: ; %entry
7731; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7732; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7733; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7734; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7735; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7736; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7737; GFX7-NEXT:    s_mov_b32 s6, s4
7738; GFX7-NEXT:    s_mov_b32 s7, s5
7739; GFX7-NEXT:    s_mov_b32 s11, s12
7740; GFX7-NEXT:    s_mov_b32 s10, s13
7741; GFX7-NEXT:    s_add_u32 s6, s6, s11
7742; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7743; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7744; GFX7-NEXT:    s_mov_b32 s7, s10
7745; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7746; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7747; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7748; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7749; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7750; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7751; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7752; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7753; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7754; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7755; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7756; GFX7-NEXT:    s_waitcnt vmcnt(0)
7757; GFX7-NEXT:    flat_store_dword v[0:1], v2
7758; GFX7-NEXT:    s_endpgm
7759;
7760; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7761; GFX10-WGP:       ; %bb.0: ; %entry
7762; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7763; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7764; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7765; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7766; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7767; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7768; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7769; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7770; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7771; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7772; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7773; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7774; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7775; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7776; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7777; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7778; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7779; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7780; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7781; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7782; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7783; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7784; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7785; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7786; GFX10-WGP-NEXT:    buffer_gl0_inv
7787; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7788; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7789; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7790; GFX10-WGP-NEXT:    s_endpgm
7791;
7792; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7793; GFX10-CU:       ; %bb.0: ; %entry
7794; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7795; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7796; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7797; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7798; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7799; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7800; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7801; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7802; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7803; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7804; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7805; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7806; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7807; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7808; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7809; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7810; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7811; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7812; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7813; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7814; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7815; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7816; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7817; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7818; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7819; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7820; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7821; GFX10-CU-NEXT:    s_endpgm
7822;
7823; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7824; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7825; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7826; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7827; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7828; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7829; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7830; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7831; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7832; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7833; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7834; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7835; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7836; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7837; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7838; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7839; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7840; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7841; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7842; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7843; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7845; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7846; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7847; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7848; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7849; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7850; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7851; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7852; SKIP-CACHE-INV-NEXT:    s_endpgm
7853;
7854; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7855; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7856; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7857; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7858; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7859; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7860; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7861; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7862; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7863; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7864; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7865; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7866; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7867; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7868; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7869; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7870; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7871; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7872;
7873; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7874; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7875; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7876; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7877; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7878; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7879; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7880; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7881; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7882; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7883; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7884; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7885; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7886; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7887; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7888; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7889; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7890; GFX90A-TGSPLIT-NEXT:    s_endpgm
7891;
7892; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7893; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7894; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7895; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7896; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7897; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7898; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7899; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7900; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7901; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7902; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7903; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7904; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7905; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7906; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7907; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7908; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7909; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7910;
7911; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7912; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7913; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7914; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7915; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7916; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7917; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7918; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7919; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7920; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7921; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7922; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7923; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7924; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7925; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
7926; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7927; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7928; GFX940-TGSPLIT-NEXT:    s_endpgm
7929;
7930; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7931; GFX11-WGP:       ; %bb.0: ; %entry
7932; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7933; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7934; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7935; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7936; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7937; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7938; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7939; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7940; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7941; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7942; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7943; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7944; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7945; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7946; GFX11-WGP-NEXT:    buffer_gl0_inv
7947; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7948; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7949; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7950; GFX11-WGP-NEXT:    s_endpgm
7951;
7952; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7953; GFX11-CU:       ; %bb.0: ; %entry
7954; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7955; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7956; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7957; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7958; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7959; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7960; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7961; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7962; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7963; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7964; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7965; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7966; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7967; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7968; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7969; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7970; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7971; GFX11-CU-NEXT:    s_endpgm
7972;
7973; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7974; GFX12-WGP:       ; %bb.0: ; %entry
7975; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7976; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7977; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7978; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7979; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7980; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7981; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7982; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7983; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7984; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7985; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
7986; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
7987; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
7988; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7989; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7990; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
7991; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
7992; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7993; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
7994; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7995; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7996; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7997; GFX12-WGP-NEXT:    s_endpgm
7998;
7999; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
8000; GFX12-CU:       ; %bb.0: ; %entry
8001; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8002; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8003; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8004; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8005; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8006; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8007; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8008; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8009; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8010; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8011; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8012; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8013; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8014; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8015; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8016; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8017; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8018; GFX12-CU-NEXT:    s_endpgm
8019    ptr %out, i32 %in, i32 %old) {
8020entry:
8021  %gep = getelementptr i32, ptr %out, i32 4
8022  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
8023  %val0 = extractvalue { i32, i1 } %val, 0
8024  store i32 %val0, ptr %out, align 4
8025  ret void
8026}
8027
; Kernel under test: a volatile cmpxchg with syncscope("workgroup"), success
; ordering seq_cst and failure ordering acquire, issued through a pointer in
; the default address space (the assertions below show it lowering to flat_*
; instructions). The value read by the cmpxchg is stored to %out afterwards,
; so the atomic's result is used.
;
; The assertion lines placed between the signature and the parameter list are
; autogenerated (see the NOTE on the first line of this file); regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
8028define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
8029; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8030; GFX7:       ; %bb.0: ; %entry
8031; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8032; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8033; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8034; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8035; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8036; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8037; GFX7-NEXT:    s_mov_b32 s6, s4
8038; GFX7-NEXT:    s_mov_b32 s7, s5
8039; GFX7-NEXT:    s_mov_b32 s11, s12
8040; GFX7-NEXT:    s_mov_b32 s10, s13
8041; GFX7-NEXT:    s_add_u32 s6, s6, s11
8042; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8043; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8044; GFX7-NEXT:    s_mov_b32 s7, s10
8045; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8046; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8047; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8048; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8049; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8050; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8051; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8052; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8053; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8054; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8055; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8056; GFX7-NEXT:    s_waitcnt vmcnt(0)
8057; GFX7-NEXT:    flat_store_dword v[0:1], v2
8058; GFX7-NEXT:    s_endpgm
8059;
8060; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8061; GFX10-WGP:       ; %bb.0: ; %entry
8062; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8063; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8064; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8065; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8066; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8067; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8068; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8069; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8070; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8071; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8072; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8073; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8074; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8075; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8076; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8077; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8078; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8079; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8080; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8081; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8082; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8083; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8084; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8085; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8086; GFX10-WGP-NEXT:    buffer_gl0_inv
8087; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8088; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8089; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8090; GFX10-WGP-NEXT:    s_endpgm
8091;
8092; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8093; GFX10-CU:       ; %bb.0: ; %entry
8094; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8095; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8096; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8097; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8098; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8099; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8100; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8101; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8102; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8103; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8104; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8105; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8106; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8107; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8108; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8109; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8110; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8111; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8112; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8113; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8114; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8115; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8116; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8117; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8118; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8119; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8120; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8121; GFX10-CU-NEXT:    s_endpgm
8122;
8123; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8124; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8125; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8126; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8127; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8128; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8129; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8130; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8131; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8132; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8133; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8134; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8135; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8136; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8137; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8138; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8139; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8140; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8141; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8143; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8144; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8145; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8146; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8147; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8148; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8150; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8151; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8152; SKIP-CACHE-INV-NEXT:    s_endpgm
8153;
8154; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8155; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8156; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8157; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8158; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8159; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8160; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8161; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8162; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8163; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8164; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8165; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8166; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8167; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8168; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8169; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8170; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8171; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8172;
8173; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8174; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8175; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8176; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8177; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8178; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8179; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8180; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8181; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8182; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8183; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8184; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8185; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8186; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8187; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8188; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8189; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8190; GFX90A-TGSPLIT-NEXT:    s_endpgm
8191;
8192; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8193; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8194; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8195; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8196; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8197; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8198; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8199; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8200; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8201; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8202; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8203; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8204; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8205; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8206; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8207; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8208; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8209; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8210;
8211; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8212; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8213; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8214; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8215; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8216; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8217; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8218; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8219; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8220; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8221; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8222; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8223; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8224; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8225; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
8226; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8227; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8228; GFX940-TGSPLIT-NEXT:    s_endpgm
8229;
8230; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8231; GFX11-WGP:       ; %bb.0: ; %entry
8232; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8233; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8234; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8235; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8236; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8237; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8238; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8239; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8240; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8241; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8242; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8243; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8244; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8245; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8246; GFX11-WGP-NEXT:    buffer_gl0_inv
8247; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8248; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8249; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8250; GFX11-WGP-NEXT:    s_endpgm
8251;
8252; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8253; GFX11-CU:       ; %bb.0: ; %entry
8254; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8255; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8256; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8257; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8258; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8259; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8260; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8261; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8262; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8263; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8264; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8265; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8266; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8267; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8268; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8269; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8270; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8271; GFX11-CU-NEXT:    s_endpgm
8272;
8273; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8274; GFX12-WGP:       ; %bb.0: ; %entry
8275; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8276; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8277; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8278; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8279; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8280; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8281; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8282; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8283; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8284; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8285; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8286; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8287; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
8288; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8289; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
8290; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8291; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8292; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8293; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
8294; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8295; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8296; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8297; GFX12-WGP-NEXT:    s_endpgm
8298;
8299; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8300; GFX12-CU:       ; %bb.0: ; %entry
8301; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8302; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8303; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8304; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8305; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8306; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8307; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8308; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8309; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8310; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8311; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8312; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8313; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8314; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8315; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8316; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8317; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8318; GFX12-CU-NEXT:    s_endpgm
8319    ptr %out, i32 %in, i32 %old) {
8320entry:
  ; Address of i32 element 4 relative to %out, i.e. 16 bytes past %out.
8321  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
8322  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the i32 value read from memory.
8323  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store that value to %out, which makes the atomic's result used.
8324  store i32 %val0, ptr %out, align 4
8325  ret void
8326}
8327
8328define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
8329; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8330; GFX7:       ; %bb.0: ; %entry
8331; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8332; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8333; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8334; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8335; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8336; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8337; GFX7-NEXT:    s_mov_b32 s6, s4
8338; GFX7-NEXT:    s_mov_b32 s7, s5
8339; GFX7-NEXT:    s_mov_b32 s11, s12
8340; GFX7-NEXT:    s_mov_b32 s10, s13
8341; GFX7-NEXT:    s_add_u32 s6, s6, s11
8342; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8343; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8344; GFX7-NEXT:    s_mov_b32 s7, s10
8345; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8346; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8347; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8348; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8349; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8350; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8351; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8352; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8353; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8354; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8355; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8356; GFX7-NEXT:    s_waitcnt vmcnt(0)
8357; GFX7-NEXT:    flat_store_dword v[0:1], v2
8358; GFX7-NEXT:    s_endpgm
8359;
8360; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8361; GFX10-WGP:       ; %bb.0: ; %entry
8362; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8363; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8364; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8365; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8366; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8367; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8368; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8369; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8370; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8371; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8372; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8373; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8374; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8375; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8376; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8377; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8378; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8379; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8380; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8381; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8382; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8383; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8384; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8385; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8386; GFX10-WGP-NEXT:    buffer_gl0_inv
8387; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8388; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8389; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8390; GFX10-WGP-NEXT:    s_endpgm
8391;
8392; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8393; GFX10-CU:       ; %bb.0: ; %entry
8394; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8395; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8396; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8397; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8398; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8399; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8400; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8401; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8402; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8403; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8404; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8405; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8406; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8407; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8408; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8409; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8410; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8411; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8412; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8413; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8414; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8415; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8416; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8417; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8418; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8419; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8420; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8421; GFX10-CU-NEXT:    s_endpgm
8422;
8423; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8424; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8425; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8426; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8427; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8428; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8429; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8430; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8431; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8432; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8433; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8434; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8435; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8436; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8437; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8438; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8439; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8440; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8441; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8443; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8444; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8445; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8446; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8447; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8450; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8451; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8452; SKIP-CACHE-INV-NEXT:    s_endpgm
8453;
8454; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8455; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8456; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8457; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8458; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8459; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8460; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8461; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8462; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8463; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8464; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8465; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8466; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8467; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8468; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8469; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8470; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8471; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8472;
8473; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8474; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8475; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8476; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8477; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8478; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8479; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8480; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8481; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8482; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8483; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8484; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8485; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8486; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8487; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8488; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8489; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8490; GFX90A-TGSPLIT-NEXT:    s_endpgm
8491;
8492; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8493; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8494; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8495; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8496; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8497; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8498; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8499; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8500; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8501; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8502; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8503; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8504; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8505; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8506; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8507; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8508; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8509; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8510;
8511; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8512; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8513; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8514; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8515; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8516; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8517; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8518; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8519; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8520; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8521; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8522; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8523; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8524; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8525; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
8526; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8527; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8528; GFX940-TGSPLIT-NEXT:    s_endpgm
8529;
8530; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8531; GFX11-WGP:       ; %bb.0: ; %entry
8532; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8533; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8534; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8535; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8536; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8537; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8538; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8539; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8540; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8541; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8542; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8543; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8544; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8545; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8546; GFX11-WGP-NEXT:    buffer_gl0_inv
8547; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8548; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8549; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8550; GFX11-WGP-NEXT:    s_endpgm
8551;
8552; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8553; GFX11-CU:       ; %bb.0: ; %entry
8554; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8555; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8556; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8557; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8558; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8559; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8560; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8561; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8562; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8563; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8564; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8565; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8566; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8567; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8568; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8569; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8570; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8571; GFX11-CU-NEXT:    s_endpgm
8572;
8573; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8574; GFX12-WGP:       ; %bb.0: ; %entry
8575; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8576; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8577; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8578; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8579; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8580; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8581; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8582; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8583; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8584; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8585; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8586; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8587; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
8588; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8589; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
8590; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8591; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8592; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8593; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
8594; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8595; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8596; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8597; GFX12-WGP-NEXT:    s_endpgm
8598;
8599; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8600; GFX12-CU:       ; %bb.0: ; %entry
8601; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8602; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8603; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8604; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8605; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8606; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8607; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8608; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8609; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8610; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8611; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8612; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8613; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8614; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8615; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8616; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8617; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8618; GFX12-CU-NEXT:    s_endpgm
8619    ptr %out, i32 %in, i32 %old) {
8620entry:
8621  %gep = getelementptr i32, ptr %out, i32 4
8622  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
8623  %val0 = extractvalue { i32, i1 } %val, 0
8624  store i32 %val0, ptr %out, align 4
8625  ret void
8626}
8627
; Returning cmpxchg through a flat pointer at syncscope("workgroup"), with
; success ordering acquire and failure ordering seq_cst. The value the cmpxchg
; read from memory is then stored through %out.
;
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand. What the expectations below show right after the atomic:
;   - gfx1010 / gfx1100 without +cumode: wait on vmcnt and lgkmcnt, then
;     buffer_gl0_inv.
;   - gfx1200 without +cumode: s_wait_loadcnt_dscnt, then
;     global_inv scope:SCOPE_SE.
;   - gfx90a / gfx940 with +tgsplit: wait on vmcnt, then buffer_wbinvl1_vol
;     (gfx90a) or buffer_inv sc0 (gfx940).
;   - every other run line: waits only, no cache invalidate.
8628define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
8629; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8630; GFX7:       ; %bb.0: ; %entry
8631; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8632; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8633; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8634; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8635; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8636; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8637; GFX7-NEXT:    s_mov_b32 s6, s4
8638; GFX7-NEXT:    s_mov_b32 s7, s5
8639; GFX7-NEXT:    s_mov_b32 s11, s12
8640; GFX7-NEXT:    s_mov_b32 s10, s13
8641; GFX7-NEXT:    s_add_u32 s6, s6, s11
8642; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8643; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8644; GFX7-NEXT:    s_mov_b32 s7, s10
8645; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8646; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8647; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8648; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8649; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8650; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8651; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8652; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8653; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8654; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8655; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8656; GFX7-NEXT:    s_waitcnt vmcnt(0)
8657; GFX7-NEXT:    flat_store_dword v[0:1], v2
8658; GFX7-NEXT:    s_endpgm
8659;
8660; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8661; GFX10-WGP:       ; %bb.0: ; %entry
8662; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8663; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8664; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8665; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8666; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8667; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8668; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8669; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8670; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8671; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8672; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8673; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8674; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8675; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8676; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8677; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8678; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8679; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8680; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8681; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8682; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8683; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8684; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8685; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8686; GFX10-WGP-NEXT:    buffer_gl0_inv
8687; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8688; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8689; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8690; GFX10-WGP-NEXT:    s_endpgm
8691;
8692; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8693; GFX10-CU:       ; %bb.0: ; %entry
8694; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8695; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8696; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8697; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8698; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8699; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8700; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8701; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8702; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8703; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8704; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8705; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8706; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8707; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8708; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8709; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8710; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8711; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8712; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8713; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8714; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8715; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8716; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8717; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8718; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8719; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8720; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8721; GFX10-CU-NEXT:    s_endpgm
8722;
8723; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8724; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8725; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8726; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8727; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8728; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8729; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8730; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8731; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8732; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8733; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8734; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8735; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8736; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8737; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8738; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8739; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8740; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8741; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8742; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8743; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8744; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8745; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8746; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8747; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8748; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8749; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8750; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8751; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8752; SKIP-CACHE-INV-NEXT:    s_endpgm
8753;
8754; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8755; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8756; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8757; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8758; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8759; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8760; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8761; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8762; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8763; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8764; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8765; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8766; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8767; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8768; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8769; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8770; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8771; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8772;
8773; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8774; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8775; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8776; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8777; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8778; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8779; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8780; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8781; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8782; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8783; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8784; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8785; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8786; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8787; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8788; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8789; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8790; GFX90A-TGSPLIT-NEXT:    s_endpgm
8791;
8792; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8793; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8794; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8795; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8796; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8797; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8798; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8799; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8800; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8801; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8802; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8803; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8804; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8805; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8806; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8807; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8808; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8809; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8810;
8811; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8812; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8813; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8814; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8815; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8816; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8817; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8819; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8820; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8821; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8822; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8823; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8824; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8825; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
8826; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8827; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8828; GFX940-TGSPLIT-NEXT:    s_endpgm
8829;
8830; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8831; GFX11-WGP:       ; %bb.0: ; %entry
8832; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8833; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8834; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8835; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8836; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8837; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8838; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8839; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8840; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8841; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8842; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8843; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8844; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8845; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8846; GFX11-WGP-NEXT:    buffer_gl0_inv
8847; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8848; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8849; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8850; GFX11-WGP-NEXT:    s_endpgm
8851;
8852; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8853; GFX11-CU:       ; %bb.0: ; %entry
8854; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8855; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8856; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8857; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8858; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8859; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8860; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8861; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8862; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8863; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8864; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8865; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8866; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8867; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8868; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8869; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8870; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8871; GFX11-CU-NEXT:    s_endpgm
8872;
8873; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8874; GFX12-WGP:       ; %bb.0: ; %entry
8875; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8876; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8877; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8878; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8879; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8880; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8881; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8882; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8883; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8884; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8885; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8886; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8887; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
8888; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8889; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
8890; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8891; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
8892; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8893; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8894; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8895; GFX12-WGP-NEXT:    s_endpgm
8896;
8897; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8898; GFX12-CU:       ; %bb.0: ; %entry
8899; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8900; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8901; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8902; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8903; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8904; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8905; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8906; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8907; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8908; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8909; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8910; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8911; GFX12-CU-NEXT:    s_wait_dscnt 0x0
8912; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8913; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8914; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8915; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8916; GFX12-CU-NEXT:    s_endpgm
8917    ptr %out, i32 %in, i32 %old) {
8918entry:
  ; i32 element 4 past %out, i.e. byte offset 16. The expectations above show
  ; it either folded into the atomic as `offset:16` or materialized with an
  ; explicit 64-bit scalar add of 16 (s_add_u32 / s_addc_u32).
8919  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and swap in %in: success ordering acquire, failure
  ; ordering seq_cst, workgroup scope.
8920  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
  ; Field 0 of the cmpxchg result pair is the value read from memory; field 1
  ; (the success bit) is not used by this test.
8921  %val0 = extractvalue { i32, i1 } %val, 0
8922  store i32 %val0, ptr %out, align 4
8923  ret void
8924}
8925
8926define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
8927; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
8928; GFX7:       ; %bb.0: ; %entry
8929; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8930; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8931; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8932; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8933; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8934; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8935; GFX7-NEXT:    s_mov_b32 s6, s4
8936; GFX7-NEXT:    s_mov_b32 s7, s5
8937; GFX7-NEXT:    s_mov_b32 s11, s12
8938; GFX7-NEXT:    s_mov_b32 s10, s13
8939; GFX7-NEXT:    s_add_u32 s6, s6, s11
8940; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8941; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8942; GFX7-NEXT:    s_mov_b32 s7, s10
8943; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8944; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8945; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8946; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8947; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8948; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8949; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8950; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8951; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8952; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8953; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8954; GFX7-NEXT:    s_waitcnt vmcnt(0)
8955; GFX7-NEXT:    flat_store_dword v[0:1], v2
8956; GFX7-NEXT:    s_endpgm
8957;
8958; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
8959; GFX10-WGP:       ; %bb.0: ; %entry
8960; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8961; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8962; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8963; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8964; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8965; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8966; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8967; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8968; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8969; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8970; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8971; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8972; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8973; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8974; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8975; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8976; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8977; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8978; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8979; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8980; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8981; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8982; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8983; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8984; GFX10-WGP-NEXT:    buffer_gl0_inv
8985; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8986; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8987; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8988; GFX10-WGP-NEXT:    s_endpgm
8989;
8990; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
8991; GFX10-CU:       ; %bb.0: ; %entry
8992; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8993; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8994; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8995; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8996; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8997; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8998; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8999; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9000; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9001; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9002; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9003; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9004; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9005; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9006; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9007; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9008; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9009; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9010; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9011; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9012; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9013; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9014; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9015; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9016; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9017; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9018; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9019; GFX10-CU-NEXT:    s_endpgm
9020;
9021; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9022; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9023; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9024; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9025; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9026; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9027; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9028; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9029; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9030; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9031; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9032; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9033; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9034; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9035; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9036; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9037; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9038; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9039; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9040; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9041; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9042; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9043; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9044; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9045; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9046; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9047; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9048; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9049; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9050; SKIP-CACHE-INV-NEXT:    s_endpgm
9051;
9052; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9053; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9054; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9055; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9056; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9057; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9058; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9059; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9060; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9061; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9062; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9063; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9064; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9065; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9066; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9067; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9068; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9069; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9070;
9071; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9072; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9073; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9074; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9075; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9076; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9077; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9078; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9079; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9080; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9081; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9082; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9083; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9084; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9085; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9086; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9087; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9088; GFX90A-TGSPLIT-NEXT:    s_endpgm
9089;
9090; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9091; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9092; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9093; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9094; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9095; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9096; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9097; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9098; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9099; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9100; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9101; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9102; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9103; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9104; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9105; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9106; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9107; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9108;
9109; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9110; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9111; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9112; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9113; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9114; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9115; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9116; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9117; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9118; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9119; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9120; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9121; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9122; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9123; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
9124; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9125; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9126; GFX940-TGSPLIT-NEXT:    s_endpgm
9127;
9128; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9129; GFX11-WGP:       ; %bb.0: ; %entry
9130; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9131; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9132; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9133; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9134; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9135; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9136; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9137; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9138; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9139; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9140; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9141; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9142; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9143; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9144; GFX11-WGP-NEXT:    buffer_gl0_inv
9145; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9146; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9147; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9148; GFX11-WGP-NEXT:    s_endpgm
9149;
9150; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9151; GFX11-CU:       ; %bb.0: ; %entry
9152; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9153; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9154; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9155; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9156; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9157; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9158; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9159; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9160; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9161; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9162; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9163; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9164; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9165; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9166; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9167; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9168; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9169; GFX11-CU-NEXT:    s_endpgm
9170;
9171; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9172; GFX12-WGP:       ; %bb.0: ; %entry
9173; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9174; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9175; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9176; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9177; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9178; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9179; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9180; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9181; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9182; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9183; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9184; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9185; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
9186; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9187; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
9188; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9189; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9190; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9191; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
9192; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9193; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9194; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9195; GFX12-WGP-NEXT:    s_endpgm
9196;
9197; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9198; GFX12-CU:       ; %bb.0: ; %entry
9199; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9200; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9201; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9202; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9203; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9204; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9205; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9206; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9207; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9208; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9209; GFX12-CU-NEXT:    s_wait_dscnt 0x0
9210; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9211; GFX12-CU-NEXT:    s_wait_dscnt 0x0
9212; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9213; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9214; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9215; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9216; GFX12-CU-NEXT:    s_endpgm
9217    ptr %out, i32 %in, i32 %old) {
9218entry:
9219  %gep = getelementptr i32, ptr %out, i32 4
9220  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
9221  %val0 = extractvalue { i32, i1 } %val, 0
9222  store i32 %val0, ptr %out, align 4
9223  ret void
9224}
9225
; Workgroup-scope flat cmpxchg whose result is used: success ordering acq_rel,
; failure ordering seq_cst. The kernel compare-and-swaps the i32 located 16
; bytes past %out (expected value %old, replacement %in) and stores the value
; that was loaded back to %out.
; The assertion lines inside the signature below are generated by
; utils/update_llc_test_checks.py (see the first line of this file);
; regenerate them with that script instead of editing them by hand.
9226define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
9227; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9228; GFX7:       ; %bb.0: ; %entry
9229; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9230; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9231; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9232; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9233; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9234; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9235; GFX7-NEXT:    s_mov_b32 s6, s4
9236; GFX7-NEXT:    s_mov_b32 s7, s5
9237; GFX7-NEXT:    s_mov_b32 s11, s12
9238; GFX7-NEXT:    s_mov_b32 s10, s13
9239; GFX7-NEXT:    s_add_u32 s6, s6, s11
9240; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9241; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9242; GFX7-NEXT:    s_mov_b32 s7, s10
9243; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9244; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9245; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9246; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9247; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9248; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9249; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9250; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9251; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9252; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9253; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9254; GFX7-NEXT:    s_waitcnt vmcnt(0)
9255; GFX7-NEXT:    flat_store_dword v[0:1], v2
9256; GFX7-NEXT:    s_endpgm
9257;
9258; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9259; GFX10-WGP:       ; %bb.0: ; %entry
9260; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9261; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9262; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9263; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9264; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9265; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9266; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9267; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9268; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9269; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9270; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9271; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9272; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9273; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9274; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9275; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9276; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9277; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9278; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9279; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9280; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9281; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9282; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9283; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9284; GFX10-WGP-NEXT:    buffer_gl0_inv
9285; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9286; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9287; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9288; GFX10-WGP-NEXT:    s_endpgm
9289;
9290; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9291; GFX10-CU:       ; %bb.0: ; %entry
9292; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9293; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9294; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9295; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9296; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9297; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9298; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9299; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9300; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9301; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9302; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9303; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9304; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9305; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9306; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9307; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9308; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9309; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9310; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9311; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9312; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9313; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9314; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9315; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9316; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9317; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9318; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9319; GFX10-CU-NEXT:    s_endpgm
9320;
9321; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9322; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9323; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9324; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9325; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9326; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9327; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9328; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9329; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9330; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9331; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9332; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9333; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9334; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9335; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9336; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9337; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9338; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9339; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9340; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9341; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9342; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9343; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9344; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9345; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9346; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9347; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9348; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9349; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9350; SKIP-CACHE-INV-NEXT:    s_endpgm
9351;
9352; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9353; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9354; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9355; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9356; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9357; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9358; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9359; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9360; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9361; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9362; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9363; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9364; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9365; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9366; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9367; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9368; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9369; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9370;
9371; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9372; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9373; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9374; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9375; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9376; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9377; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9378; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9379; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9380; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9381; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9382; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9383; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9384; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9385; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9386; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9387; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9388; GFX90A-TGSPLIT-NEXT:    s_endpgm
9389;
9390; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9391; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9392; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9393; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9394; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9395; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9396; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9397; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9398; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9399; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9400; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9401; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9402; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9403; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9404; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9405; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9406; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9407; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9408;
9409; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9410; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9411; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9412; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9413; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9414; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9415; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9416; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9417; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9418; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9419; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9420; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9421; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9422; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9423; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
9424; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9425; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9426; GFX940-TGSPLIT-NEXT:    s_endpgm
9427;
9428; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9429; GFX11-WGP:       ; %bb.0: ; %entry
9430; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9431; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9432; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9433; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9434; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9435; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9436; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9437; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9438; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9439; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9440; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9441; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9442; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9443; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9444; GFX11-WGP-NEXT:    buffer_gl0_inv
9445; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9446; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9447; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9448; GFX11-WGP-NEXT:    s_endpgm
9449;
9450; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9451; GFX11-CU:       ; %bb.0: ; %entry
9452; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9453; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9454; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9455; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9456; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9457; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9458; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9459; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9460; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9461; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9462; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9463; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9464; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9465; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9466; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9467; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9468; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9469; GFX11-CU-NEXT:    s_endpgm
9470;
9471; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9472; GFX12-WGP:       ; %bb.0: ; %entry
9473; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9474; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9475; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9476; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9477; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9478; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9479; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9480; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9481; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9482; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9483; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9484; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9485; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
9486; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9487; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
9488; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9489; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9490; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9491; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
9492; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9493; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9494; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9495; GFX12-WGP-NEXT:    s_endpgm
9496;
9497; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9498; GFX12-CU:       ; %bb.0: ; %entry
9499; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9500; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9501; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9502; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9503; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9504; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9505; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9506; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9507; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9508; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9509; GFX12-CU-NEXT:    s_wait_dscnt 0x0
9510; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9511; GFX12-CU-NEXT:    s_wait_dscnt 0x0
9512; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9513; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9514; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9515; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9516; GFX12-CU-NEXT:    s_endpgm
9517    ptr %out, i32 %in, i32 %old) {
9518entry:
  ; Address of i32 element 4 relative to %out, i.e. 16 bytes past %out.
9519  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap at syncscope("workgroup"): compares against %old,
  ; writes %in on a match; orderings are acq_rel on success, seq_cst on failure.
9520  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the i1
  ; success flag (field 1) is not used by this test.
9521  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the cmpxchg result live.
9522  store i32 %val0, ptr %out, align 4
9523  ret void
9524}
9525
9526define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
9527; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9528; GFX7:       ; %bb.0: ; %entry
9529; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9530; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9531; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9532; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9533; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9534; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9535; GFX7-NEXT:    s_mov_b32 s6, s4
9536; GFX7-NEXT:    s_mov_b32 s7, s5
9537; GFX7-NEXT:    s_mov_b32 s11, s12
9538; GFX7-NEXT:    s_mov_b32 s10, s13
9539; GFX7-NEXT:    s_add_u32 s6, s6, s11
9540; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9541; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9542; GFX7-NEXT:    s_mov_b32 s7, s10
9543; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9544; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9545; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9546; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9547; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9548; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9549; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9550; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9551; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9552; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9553; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9554; GFX7-NEXT:    s_waitcnt vmcnt(0)
9555; GFX7-NEXT:    flat_store_dword v[0:1], v2
9556; GFX7-NEXT:    s_endpgm
9557;
9558; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9559; GFX10-WGP:       ; %bb.0: ; %entry
9560; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9561; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9562; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9563; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9564; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9565; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9566; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9567; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9568; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9569; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9570; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9571; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9572; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9573; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9574; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9575; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9576; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9577; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9578; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9579; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9580; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9581; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9582; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9583; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9584; GFX10-WGP-NEXT:    buffer_gl0_inv
9585; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9586; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9587; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9588; GFX10-WGP-NEXT:    s_endpgm
9589;
9590; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9591; GFX10-CU:       ; %bb.0: ; %entry
9592; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9593; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9594; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9595; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9596; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9597; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9598; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9599; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9600; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9601; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9602; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9603; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9604; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9605; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9606; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9607; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9608; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9609; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9610; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9611; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9612; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9613; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9614; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9615; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9616; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9617; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9618; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9619; GFX10-CU-NEXT:    s_endpgm
9620;
9621; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9622; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9623; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9624; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9625; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9626; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9627; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9628; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9629; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9630; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9631; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9632; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9633; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9634; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9635; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9636; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9638; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9639; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9640; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9641; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9642; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9643; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9644; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9645; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9648; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9649; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9650; SKIP-CACHE-INV-NEXT:    s_endpgm
9651;
9652; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9653; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9654; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9655; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9656; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9657; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9658; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9659; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9660; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9661; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9662; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9663; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9664; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9665; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9666; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9667; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9668; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9669; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9670;
9671; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9672; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9673; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9674; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9675; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9676; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9677; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9678; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9679; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9680; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9681; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9682; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9683; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9684; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9685; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9686; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9687; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9688; GFX90A-TGSPLIT-NEXT:    s_endpgm
9689;
9690; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9691; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9692; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9693; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9694; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9695; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9696; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9697; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9698; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9699; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9700; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9701; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9702; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9703; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9704; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9705; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9706; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9707; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9708;
9709; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9710; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9711; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9712; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9713; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9714; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9715; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9716; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9717; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9718; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9719; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9720; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9721; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9722; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9723; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
9724; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9725; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9726; GFX940-TGSPLIT-NEXT:    s_endpgm
9727;
9728; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9729; GFX11-WGP:       ; %bb.0: ; %entry
9730; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9731; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9732; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9733; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9734; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9735; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9736; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9737; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9738; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9739; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9740; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9741; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9742; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9743; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9744; GFX11-WGP-NEXT:    buffer_gl0_inv
9745; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9746; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9747; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9748; GFX11-WGP-NEXT:    s_endpgm
9749;
9750; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9751; GFX11-CU:       ; %bb.0: ; %entry
9752; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9753; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9754; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9755; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9756; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9757; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9758; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9759; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9760; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9761; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9762; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9763; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9764; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9765; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9766; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9767; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9768; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9769; GFX11-CU-NEXT:    s_endpgm
9770;
9771; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9772; GFX12-WGP:       ; %bb.0: ; %entry
9773; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9774; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9775; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9776; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9777; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9778; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9779; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9780; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9781; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9782; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9783; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9784; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9785; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
9786; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9787; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
9788; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9789; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9790; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9791; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
9792; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9793; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9794; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9795; GFX12-WGP-NEXT:    s_endpgm
9796;
9797; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9798; GFX12-CU:       ; %bb.0: ; %entry
9799; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9800; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9801; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9802; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9803; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9804; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9805; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9806; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9807; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9808; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9809; GFX12-CU-NEXT:    s_wait_dscnt 0x0
9810; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9811; GFX12-CU-NEXT:    s_wait_dscnt 0x0
9812; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9813; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9814; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9815; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9816; GFX12-CU-NEXT:    s_endpgm
9817    ptr %out, i32 %in, i32 %old) {
9818entry:
9819  %gep = getelementptr i32, ptr %out, i32 4
9820  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
9821  %val0 = extractvalue { i32, i1 } %val, 0
9822  store i32 %val0, ptr %out, align 4
9823  ret void
9824}
9825
; Atomic i32 load with unordered ordering at syncscope "workgroup-one-as"
; through a flat pointer, followed by a plain store of the loaded value.
; In the expected output below the load carries no cache-control modifier
; and no cache invalidate appears for any of the checked configurations.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
9826define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
9827; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
9828; GFX7:       ; %bb.0: ; %entry
9829; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9830; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9831; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9832; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9833; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9834; GFX7-NEXT:    flat_load_dword v2, v[0:1]
9835; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9836; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9837; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9838; GFX7-NEXT:    flat_store_dword v[0:1], v2
9839; GFX7-NEXT:    s_endpgm
9840;
9841; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
9842; GFX10-WGP:       ; %bb.0: ; %entry
9843; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9844; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9845; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9846; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9847; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9848; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
9849; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9850; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9851; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9852; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9853; GFX10-WGP-NEXT:    s_endpgm
9854;
9855; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
9856; GFX10-CU:       ; %bb.0: ; %entry
9857; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9858; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9859; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9860; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9861; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9862; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
9863; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9864; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9865; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9866; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9867; GFX10-CU-NEXT:    s_endpgm
9868;
9869; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load:
9870; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9871; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9872; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
9873; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9874; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9876; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
9877; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9878; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9879; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9880; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9881; SKIP-CACHE-INV-NEXT:    s_endpgm
9882;
9883; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9884; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9885; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9886; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9887; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9888; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
9889; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9890; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9891; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9892; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9893; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9894;
9895; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9896; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9897; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9898; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9899; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9900; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
9901; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9902; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9903; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9904; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9905; GFX90A-TGSPLIT-NEXT:    s_endpgm
9906;
9907; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9908; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9909; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9910; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9911; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9912; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9913; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9914; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9915; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9916; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9917; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9918;
9919; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9920; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9921; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9922; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9923; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9924; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9925; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9926; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9927; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9928; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9929; GFX940-TGSPLIT-NEXT:    s_endpgm
9930;
9931; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load:
9932; GFX11-WGP:       ; %bb.0: ; %entry
9933; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9934; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9935; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9936; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9937; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
9938; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
9939; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9940; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9941; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9942; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9943; GFX11-WGP-NEXT:    s_endpgm
9944;
9945; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load:
9946; GFX11-CU:       ; %bb.0: ; %entry
9947; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9948; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9949; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9950; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9951; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
9952; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
9953; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9954; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9955; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9956; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9957; GFX11-CU-NEXT:    s_endpgm
9958;
9959; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load:
9960; GFX12-WGP:       ; %bb.0: ; %entry
9961; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9962; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9963; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9964; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9965; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
9966; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
9967; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9968; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9969; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9970; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9971; GFX12-WGP-NEXT:    s_endpgm
9972;
9973; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load:
9974; GFX12-CU:       ; %bb.0: ; %entry
9975; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9976; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9977; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9978; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9979; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
9980; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
9981; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9982; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9983; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9984; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9985; GFX12-CU-NEXT:    s_endpgm
9986    ptr %in, ptr %out) {
9987entry:
  ; Operation under test: unordered atomic load from %in at
  ; "workgroup-one-as" scope. The store to %out is non-atomic.
9988  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4
9989  store i32 %val, ptr %out
9990  ret void
9991}
9992
; Atomic i32 load with monotonic ordering at syncscope "workgroup-one-as"
; through a flat pointer, followed by a plain store of the loaded value.
; Expected load modifiers in the check lines below: glc for the GFX10-WGP,
; GFX90A-TGSPLIT and GFX11-WGP prefixes; sc0 for both GFX940 prefixes;
; scope SCOPE_SE for GFX12-WGP; none for the remaining prefixes.
; No cache invalidate is expected for any prefix.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
9993define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
9994; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
9995; GFX7:       ; %bb.0: ; %entry
9996; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9997; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9998; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9999; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10000; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10001; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10002; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10003; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10004; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10005; GFX7-NEXT:    flat_store_dword v[0:1], v2
10006; GFX7-NEXT:    s_endpgm
10007;
10008; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
10009; GFX10-WGP:       ; %bb.0: ; %entry
10010; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10011; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10012; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10013; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10014; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10015; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
10016; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10017; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10018; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10019; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10020; GFX10-WGP-NEXT:    s_endpgm
10021;
10022; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
10023; GFX10-CU:       ; %bb.0: ; %entry
10024; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10025; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10026; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10027; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10028; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10029; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
10030; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10031; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10032; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10033; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10034; GFX10-CU-NEXT:    s_endpgm
10035;
10036; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load:
10037; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10038; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10039; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
10040; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10041; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10042; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10043; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
10044; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10045; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10046; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10047; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10048; SKIP-CACHE-INV-NEXT:    s_endpgm
10049;
10050; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10051; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10052; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10053; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10054; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10055; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10056; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10057; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10058; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10059; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10060; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10061;
10062; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10063; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10064; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10065; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10066; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10067; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10068; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
10069; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10070; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10071; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10072; GFX90A-TGSPLIT-NEXT:    s_endpgm
10073;
10074; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10075; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10076; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10077; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10078; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10079; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10080; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
10081; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10082; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10083; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10084; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10085;
10086; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10087; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10088; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10089; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10090; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10091; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10092; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
10093; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10094; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10095; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10096; GFX940-TGSPLIT-NEXT:    s_endpgm
10097;
10098; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
10099; GFX11-WGP:       ; %bb.0: ; %entry
10100; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10101; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10102; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10103; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10104; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10105; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
10106; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10107; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10108; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10109; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10110; GFX11-WGP-NEXT:    s_endpgm
10111;
10112; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load:
10113; GFX11-CU:       ; %bb.0: ; %entry
10114; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10115; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10116; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10117; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10118; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10119; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
10120; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10121; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10122; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10123; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10124; GFX11-CU-NEXT:    s_endpgm
10125;
10126; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
10127; GFX12-WGP:       ; %bb.0: ; %entry
10128; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10129; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10130; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10131; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10132; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10133; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SE
10134; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10135; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10136; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10137; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10138; GFX12-WGP-NEXT:    s_endpgm
10139;
10140; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load:
10141; GFX12-CU:       ; %bb.0: ; %entry
10142; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10143; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10144; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10145; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10146; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10147; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
10148; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10149; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10150; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10151; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10152; GFX12-CU-NEXT:    s_endpgm
10153    ptr %in, ptr %out) {
10154entry:
  ; Operation under test: monotonic atomic load from %in at
  ; "workgroup-one-as" scope. The store to %out is non-atomic.
10155  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4
10156  store i32 %val, ptr %out
10157  ret void
10158}
10159
; Atomic i32 load with acquire ordering at syncscope "workgroup-one-as"
; through a flat pointer, followed by a plain store of the loaded value.
; Expected output in the check lines below:
;   - GFX10-WGP and GFX11-WGP prefixes: load with glc, then a vmcnt wait and
;     buffer_gl0_inv before the store.
;   - GFX90A-TGSPLIT prefix: load with glc, then a vmcnt wait and
;     buffer_wbinvl1_vol.
;   - GFX940-TGSPLIT prefix: load with sc0, then a vmcnt wait and
;     buffer_inv sc0.
;   - GFX12-WGP prefix: load with scope SCOPE_SE, then a loadcnt wait and
;     global_inv with scope SCOPE_SE.
;   - GFX940-NOTTGSPLIT prefix: load with sc0 and no invalidate.
;   - Remaining prefixes: plain load and no invalidate.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
10160define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
10161; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
10162; GFX7:       ; %bb.0: ; %entry
10163; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10164; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
10165; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10166; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10167; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10168; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10169; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10170; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10171; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10172; GFX7-NEXT:    flat_store_dword v[0:1], v2
10173; GFX7-NEXT:    s_endpgm
10174;
10175; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
10176; GFX10-WGP:       ; %bb.0: ; %entry
10177; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10178; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10179; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10180; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10181; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10182; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
10183; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10184; GFX10-WGP-NEXT:    buffer_gl0_inv
10185; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10186; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10187; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10188; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10189; GFX10-WGP-NEXT:    s_endpgm
10190;
10191; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
10192; GFX10-CU:       ; %bb.0: ; %entry
10193; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10194; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10195; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10196; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10197; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10198; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
10199; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10200; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10201; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10202; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10203; GFX10-CU-NEXT:    s_endpgm
10204;
10205; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load:
10206; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10207; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10208; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
10209; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10210; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10211; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10212; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
10213; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10214; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10215; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10216; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10217; SKIP-CACHE-INV-NEXT:    s_endpgm
10218;
10219; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10220; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10221; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10222; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10223; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10224; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10225; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10226; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10227; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10228; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10229; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10230;
10231; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10232; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10233; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10234; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10235; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10236; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10237; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
10238; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10239; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
10240; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10241; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10242; GFX90A-TGSPLIT-NEXT:    s_endpgm
10243;
10244; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10245; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10246; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10247; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10248; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10249; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10250; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
10251; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10252; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10253; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10254; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10255;
10256; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10257; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10258; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10259; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10260; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10261; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10262; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
10263; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10264; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
10265; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10266; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10267; GFX940-TGSPLIT-NEXT:    s_endpgm
10268;
10269; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load:
10270; GFX11-WGP:       ; %bb.0: ; %entry
10271; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10272; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10273; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10274; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10275; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10276; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
10277; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
10278; GFX11-WGP-NEXT:    buffer_gl0_inv
10279; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10280; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10281; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10282; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10283; GFX11-WGP-NEXT:    s_endpgm
10284;
10285; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load:
10286; GFX11-CU:       ; %bb.0: ; %entry
10287; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10288; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10289; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10290; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10291; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10292; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
10293; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10294; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10295; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10296; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10297; GFX11-CU-NEXT:    s_endpgm
10298;
10299; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load:
10300; GFX12-WGP:       ; %bb.0: ; %entry
10301; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10302; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10303; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10304; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10305; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10306; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SE
10307; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
10308; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
10309; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10310; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10311; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
10312; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10313; GFX12-WGP-NEXT:    s_endpgm
10314;
10315; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load:
10316; GFX12-CU:       ; %bb.0: ; %entry
10317; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10318; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10319; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10320; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10321; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10322; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
10323; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10324; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10325; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10326; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10327; GFX12-CU-NEXT:    s_endpgm
10328    ptr %in, ptr %out) {
10329entry:
  ; Operation under test: acquire atomic load from %in at
  ; "workgroup-one-as" scope. The store to %out is non-atomic.
10330  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4
10331  store i32 %val, ptr %out
10332  ret void
10333}
10334
; Test: seq_cst atomic i32 load at syncscope("workgroup-one-as") through an
; unqualified `ptr`, followed by a plain store of the loaded value to %out.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape, as recorded in the assertions: the gfx10/gfx11/gfx12 runs
; without +cumode and the +tgsplit runs expect waits before the load plus a
; wait and a cache invalidate after it; the remaining runs expect no extra
; waits or invalidates around the load.
10335define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
10336; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
10337; GFX7:       ; %bb.0: ; %entry
10338; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10339; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
10340; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10341; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10342; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10343; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10344; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10345; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10346; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10347; GFX7-NEXT:    flat_store_dword v[0:1], v2
10348; GFX7-NEXT:    s_endpgm
10349;
10350; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
10351; GFX10-WGP:       ; %bb.0: ; %entry
10352; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10353; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10354; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10355; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10356; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10357; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10358; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10359; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
10360; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10361; GFX10-WGP-NEXT:    buffer_gl0_inv
10362; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10363; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10364; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10365; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10366; GFX10-WGP-NEXT:    s_endpgm
10367;
10368; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
10369; GFX10-CU:       ; %bb.0: ; %entry
10370; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10371; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10372; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10373; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10374; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10375; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
10376; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10377; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10378; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10379; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10380; GFX10-CU-NEXT:    s_endpgm
10381;
10382; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load:
10383; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10384; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10385; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
10386; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10387; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10388; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10389; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
10390; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10391; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10392; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10393; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10394; SKIP-CACHE-INV-NEXT:    s_endpgm
10395;
10396; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10397; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10398; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10399; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10400; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10401; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10402; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10403; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10404; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10405; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10406; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10407;
10408; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10409; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10410; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10411; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10412; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10413; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10414; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10415; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
10416; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10417; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
10418; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10419; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10420; GFX90A-TGSPLIT-NEXT:    s_endpgm
10421;
10422; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10423; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10424; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10425; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10426; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10427; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10428; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
10429; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10430; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10431; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10432; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10433;
10434; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10435; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10436; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10437; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10438; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10439; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10440; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10441; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
10442; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10443; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
10444; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10445; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10446; GFX940-TGSPLIT-NEXT:    s_endpgm
10447;
10448; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
10449; GFX11-WGP:       ; %bb.0: ; %entry
10450; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10451; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10452; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10453; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10454; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10455; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
10456; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10457; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
10458; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
10459; GFX11-WGP-NEXT:    buffer_gl0_inv
10460; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10461; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10462; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10463; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10464; GFX11-WGP-NEXT:    s_endpgm
10465;
10466; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
10467; GFX11-CU:       ; %bb.0: ; %entry
10468; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10469; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10470; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10471; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10472; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10473; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
10474; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10475; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10476; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10477; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10478; GFX11-CU-NEXT:    s_endpgm
10479;
10480; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
10481; GFX12-WGP:       ; %bb.0: ; %entry
10482; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10483; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10484; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10485; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10486; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10487; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10488; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10489; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
10490; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
10491; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SE
10492; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10493; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10494; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
10495; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
10496; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10497; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10498; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
10499; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10500; GFX12-WGP-NEXT:    s_endpgm
10501;
10502; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
10503; GFX12-CU:       ; %bb.0: ; %entry
10504; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10505; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10506; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10507; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10508; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10509; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
10510; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10511; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10512; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10513; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10514; GFX12-CU-NEXT:    s_endpgm
10515    ptr %in, ptr %out) {
10516entry:
  ; Operation under test: the atomic load. The store below is non-atomic and
  ; only keeps the loaded value live.
10517  %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
10518  store i32 %val, ptr %out
10519  ret void
10520}
10521
; Test: unordered atomic i32 store of the kernel argument %in at
; syncscope("workgroup-one-as") through an unqualified `ptr`.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape, as recorded in the assertions: every run expects only the
; kernarg loads, the wait for them, register moves and a single flat store,
; with no additional waits or cache-control instructions. The gfx940 runs
; expect the sc0 sc1 modifiers on that store.
10522define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
10523; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
10524; GFX7:       ; %bb.0: ; %entry
10525; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10526; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10527; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10528; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10529; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10530; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10531; GFX7-NEXT:    flat_store_dword v[0:1], v2
10532; GFX7-NEXT:    s_endpgm
10533;
10534; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
10535; GFX10-WGP:       ; %bb.0: ; %entry
10536; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10537; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10538; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10539; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10540; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10541; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10542; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10543; GFX10-WGP-NEXT:    s_endpgm
10544;
10545; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
10546; GFX10-CU:       ; %bb.0: ; %entry
10547; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10548; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10549; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10550; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10551; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10552; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10553; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10554; GFX10-CU-NEXT:    s_endpgm
10555;
10556; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store:
10557; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10558; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10559; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10560; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10561; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10562; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10563; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10564; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10565; SKIP-CACHE-INV-NEXT:    s_endpgm
10566;
10567; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10568; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10569; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10570; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10571; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10572; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10573; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10574; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10575; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10576;
10577; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10578; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10579; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10580; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10581; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10582; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10583; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10584; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10585; GFX90A-TGSPLIT-NEXT:    s_endpgm
10586;
10587; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10588; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10589; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10590; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10591; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10592; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10593; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10594; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10595; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10596;
10597; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10598; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10599; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10600; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10601; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10602; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10603; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10604; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10605; GFX940-TGSPLIT-NEXT:    s_endpgm
10606;
10607; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store:
10608; GFX11-WGP:       ; %bb.0: ; %entry
10609; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10610; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10611; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10612; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10613; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10614; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10615; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10616; GFX11-WGP-NEXT:    s_endpgm
10617;
10618; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store:
10619; GFX11-CU:       ; %bb.0: ; %entry
10620; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10621; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10622; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10623; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10624; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10625; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10626; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10627; GFX11-CU-NEXT:    s_endpgm
10628;
10629; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_store:
10630; GFX12-WGP:       ; %bb.0: ; %entry
10631; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10632; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10633; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10634; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10635; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10636; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10637; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10638; GFX12-WGP-NEXT:    s_endpgm
10639;
10640; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_store:
10641; GFX12-CU:       ; %bb.0: ; %entry
10642; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10643; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10644; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10645; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10646; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10647; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
10648; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10649; GFX12-CU-NEXT:    s_endpgm
10650    i32 %in, ptr %out) {
10651entry:
  ; Operation under test: the unordered atomic store.
10652  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
10653  ret void
10654}
10655
; Test: monotonic atomic i32 store of the kernel argument %in at
; syncscope("workgroup-one-as") through an unqualified `ptr`.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape, as recorded in the assertions: every run expects only the
; kernarg loads, the wait for them, register moves and a single flat store,
; with no additional waits. The gfx940 runs expect sc0 sc1 on the store, and
; the gfx1200 run without +cumode expects scope:SCOPE_SE on it.
10656define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
10657; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
10658; GFX7:       ; %bb.0: ; %entry
10659; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10660; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10661; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10662; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10663; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10664; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10665; GFX7-NEXT:    flat_store_dword v[0:1], v2
10666; GFX7-NEXT:    s_endpgm
10667;
10668; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
10669; GFX10-WGP:       ; %bb.0: ; %entry
10670; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10671; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10672; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10673; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10674; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10675; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10676; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10677; GFX10-WGP-NEXT:    s_endpgm
10678;
10679; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
10680; GFX10-CU:       ; %bb.0: ; %entry
10681; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10682; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10683; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10684; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10685; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10686; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10687; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10688; GFX10-CU-NEXT:    s_endpgm
10689;
10690; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store:
10691; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10692; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10693; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10694; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10695; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10696; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10697; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10698; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10699; SKIP-CACHE-INV-NEXT:    s_endpgm
10700;
10701; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10702; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10703; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10704; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10705; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10706; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10707; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10708; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10709; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10710;
10711; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10712; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10713; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10714; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10715; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10716; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10717; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10718; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10719; GFX90A-TGSPLIT-NEXT:    s_endpgm
10720;
10721; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10722; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10723; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10724; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10725; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10726; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10727; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10728; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10729; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10730;
10731; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10732; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10733; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10734; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10735; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10736; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10737; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10738; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10739; GFX940-TGSPLIT-NEXT:    s_endpgm
10740;
10741; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
10742; GFX11-WGP:       ; %bb.0: ; %entry
10743; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10744; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10745; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10746; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10747; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10748; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10749; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10750; GFX11-WGP-NEXT:    s_endpgm
10751;
10752; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store:
10753; GFX11-CU:       ; %bb.0: ; %entry
10754; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10755; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10756; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10757; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10758; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10759; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10760; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10761; GFX11-CU-NEXT:    s_endpgm
10762;
10763; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
10764; GFX12-WGP:       ; %bb.0: ; %entry
10765; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10766; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10767; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10768; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10769; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10770; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10771; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
10772; GFX12-WGP-NEXT:    s_endpgm
10773;
10774; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store:
10775; GFX12-CU:       ; %bb.0: ; %entry
10776; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10777; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10778; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10779; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10780; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10781; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
10782; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10783; GFX12-CU-NEXT:    s_endpgm
10784    i32 %in, ptr %out) {
10785entry:
  ; Operation under test: the monotonic atomic store.
10786  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
10787  ret void
10788}
10789
; Test: release atomic i32 store of the kernel argument %in at
; syncscope("workgroup-one-as") through an unqualified `ptr`.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape, as recorded in the assertions: the gfx10/gfx11/gfx12 runs
; without +cumode and the +tgsplit runs expect wait instructions immediately
; before the store; the remaining runs expect the store with no extra waits.
; The gfx940 runs expect sc0 sc1 on the store, and the gfx1200 run without
; +cumode expects scope:SCOPE_SE on it.
10790define amdgpu_kernel void @flat_workgroup_one_as_release_store(
10791; GFX7-LABEL: flat_workgroup_one_as_release_store:
10792; GFX7:       ; %bb.0: ; %entry
10793; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10794; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10795; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10796; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10797; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10798; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10799; GFX7-NEXT:    flat_store_dword v[0:1], v2
10800; GFX7-NEXT:    s_endpgm
10801;
10802; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
10803; GFX10-WGP:       ; %bb.0: ; %entry
10804; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10805; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10806; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10807; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10808; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10809; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10810; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10811; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10812; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10813; GFX10-WGP-NEXT:    s_endpgm
10814;
10815; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
10816; GFX10-CU:       ; %bb.0: ; %entry
10817; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10818; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10819; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10820; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10821; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10822; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10823; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10824; GFX10-CU-NEXT:    s_endpgm
10825;
10826; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store:
10827; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10828; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10829; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10830; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10831; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10832; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10833; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10834; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10835; SKIP-CACHE-INV-NEXT:    s_endpgm
10836;
10837; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10838; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10839; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10840; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10841; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10842; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10843; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10844; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10845; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10846;
10847; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10848; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10849; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10850; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10851; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10852; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10853; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10854; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10855; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10856; GFX90A-TGSPLIT-NEXT:    s_endpgm
10857;
10858; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10859; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10860; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10861; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10862; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10863; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10864; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10865; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10866; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10867;
10868; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10869; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10870; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10871; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10872; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10873; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10874; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10875; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10876; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10877; GFX940-TGSPLIT-NEXT:    s_endpgm
10878;
10879; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store:
10880; GFX11-WGP:       ; %bb.0: ; %entry
10881; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10882; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10883; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10884; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10885; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10886; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10887; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
10888; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10889; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10890; GFX11-WGP-NEXT:    s_endpgm
10891;
10892; GFX11-CU-LABEL: flat_workgroup_one_as_release_store:
10893; GFX11-CU:       ; %bb.0: ; %entry
10894; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10895; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10896; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10897; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10898; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10899; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10900; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10901; GFX11-CU-NEXT:    s_endpgm
10902;
10903; GFX12-WGP-LABEL: flat_workgroup_one_as_release_store:
10904; GFX12-WGP:       ; %bb.0: ; %entry
10905; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10906; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10907; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10908; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10909; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10910; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10911; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10912; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10913; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
10914; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
10915; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
10916; GFX12-WGP-NEXT:    s_endpgm
10917;
10918; GFX12-CU-LABEL: flat_workgroup_one_as_release_store:
10919; GFX12-CU:       ; %bb.0: ; %entry
10920; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10921; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10922; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10923; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10924; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10925; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
10926; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10927; GFX12-CU-NEXT:    s_endpgm
10928    i32 %in, ptr %out) {
10929entry:
  ; Operation under test: the release atomic store.
10930  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
10931  ret void
10932}
10933
; Kernel under test: a seq_cst atomic i32 store of %in through %out at
; syncscope "workgroup-one-as".
; The comment lines inside the signature are the autogenerated per-run
; expectations (see the NOTE on the first line of this file); regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
; As recorded below, the WGP-mode and tgsplit runs expect extra memory-counter
; waits immediately before the store, while the remaining runs expect the
; store with no added waits.
10934define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
10935; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
10936; GFX7:       ; %bb.0: ; %entry
10937; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10938; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10939; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10940; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10941; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10942; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10943; GFX7-NEXT:    flat_store_dword v[0:1], v2
10944; GFX7-NEXT:    s_endpgm
10945;
10946; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
10947; GFX10-WGP:       ; %bb.0: ; %entry
10948; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10949; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10950; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10951; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10952; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10953; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10954; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10955; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10956; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10957; GFX10-WGP-NEXT:    s_endpgm
10958;
10959; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
10960; GFX10-CU:       ; %bb.0: ; %entry
10961; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10962; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10963; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10964; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10965; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10966; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10967; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10968; GFX10-CU-NEXT:    s_endpgm
10969;
10970; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store:
10971; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10972; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10973; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10974; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10975; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10976; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10977; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10978; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10979; SKIP-CACHE-INV-NEXT:    s_endpgm
10980;
10981; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
10982; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10983; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10984; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10985; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10986; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10987; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10988; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10989; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10990;
10991; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
10992; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10993; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10994; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10995; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10996; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10997; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10998; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10999; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11000; GFX90A-TGSPLIT-NEXT:    s_endpgm
11001;
11002; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
11003; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11004; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
11005; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11006; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11007; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11008; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11009; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11010; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11011;
11012; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
11013; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11014; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
11015; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
11016; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11017; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11018; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11019; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11020; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11021; GFX940-TGSPLIT-NEXT:    s_endpgm
11022;
11023; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
11024; GFX11-WGP:       ; %bb.0: ; %entry
11025; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
11026; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
11027; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11028; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11029; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11030; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11031; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11032; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11033; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11034; GFX11-WGP-NEXT:    s_endpgm
11035;
11036; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
11037; GFX11-CU:       ; %bb.0: ; %entry
11038; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
11039; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
11040; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11041; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11042; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11043; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11044; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11045; GFX11-CU-NEXT:    s_endpgm
11046;
11047; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
11048; GFX12-WGP:       ; %bb.0: ; %entry
11049; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
11050; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
11051; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11052; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11053; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11054; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11055; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11056; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11057; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
11058; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11059; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
11060; GFX12-WGP-NEXT:    s_endpgm
11061;
11062; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
11063; GFX12-CU:       ; %bb.0: ; %entry
11064; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
11065; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
11066; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11067; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11068; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11069; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11070; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11071; GFX12-CU-NEXT:    s_endpgm
11072    i32 %in, ptr %out) {
11073entry:
11074  store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
11075  ret void
11076}
11077
; Kernel under test: a volatile monotonic atomicrmw xchg of %in into %out at
; syncscope "workgroup-one-as".
; The comment lines inside the signature are the autogenerated per-run
; expectations; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
; As recorded below, every run expects just the atomic swap after the
; kernel-argument loads, with no added waits or cache invalidation around it.
11078define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
11079; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11080; GFX7:       ; %bb.0: ; %entry
11081; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11082; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11083; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11084; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11085; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11086; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11087; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11088; GFX7-NEXT:    s_endpgm
11089;
11090; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11091; GFX10-WGP:       ; %bb.0: ; %entry
11092; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11093; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11094; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11095; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11096; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11097; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11098; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11099; GFX10-WGP-NEXT:    s_endpgm
11100;
11101; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11102; GFX10-CU:       ; %bb.0: ; %entry
11103; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11104; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11105; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11106; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11107; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11108; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11109; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11110; GFX10-CU-NEXT:    s_endpgm
11111;
11112; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11113; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11114; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11115; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11116; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11117; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11118; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11119; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11120; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11121; SKIP-CACHE-INV-NEXT:    s_endpgm
11122;
11123; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11124; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11125; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11126; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11127; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11128; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11129; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11130; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11131; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11132;
11133; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11134; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11135; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11136; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11137; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11138; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11139; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11140; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11141; GFX90A-TGSPLIT-NEXT:    s_endpgm
11142;
11143; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11144; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11145; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11146; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11147; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11148; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11149; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11150; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11151; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11152;
11153; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11154; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11155; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11156; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11157; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11158; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11159; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11160; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11161; GFX940-TGSPLIT-NEXT:    s_endpgm
11162;
11163; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11164; GFX11-WGP:       ; %bb.0: ; %entry
11165; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11166; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11167; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11168; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11169; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11170; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11171; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11172; GFX11-WGP-NEXT:    s_endpgm
11173;
11174; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11175; GFX11-CU:       ; %bb.0: ; %entry
11176; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11177; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11178; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11179; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11180; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11181; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11182; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11183; GFX11-CU-NEXT:    s_endpgm
11184;
11185; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11186; GFX12-WGP:       ; %bb.0: ; %entry
11187; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11188; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11189; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11190; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11191; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11192; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11193; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11194; GFX12-WGP-NEXT:    s_endpgm
11195;
11196; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11197; GFX12-CU:       ; %bb.0: ; %entry
11198; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11199; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11200; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11201; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11202; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11203; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11204; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11205; GFX12-CU-NEXT:    s_endpgm
11206    ptr %out, i32 %in) {
11207entry:
  ; The exchanged-out value %val is never read before the return.
11208  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
11209  ret void
11210}
11211
; Kernel under test: a volatile acquire atomicrmw xchg of %in into %out at
; syncscope "workgroup-one-as".
; The comment lines inside the signature are the autogenerated per-run
; expectations; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
; As recorded below, the WGP-mode and tgsplit runs expect a wait followed by a
; cache-invalidate instruction after the swap, while the remaining runs expect
; only the swap.
11212define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
11213; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11214; GFX7:       ; %bb.0: ; %entry
11215; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11216; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11217; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11218; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11219; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11220; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11221; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11222; GFX7-NEXT:    s_endpgm
11223;
11224; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11225; GFX10-WGP:       ; %bb.0: ; %entry
11226; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11227; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11228; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11229; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11230; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11231; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11232; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11233; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11234; GFX10-WGP-NEXT:    buffer_gl0_inv
11235; GFX10-WGP-NEXT:    s_endpgm
11236;
11237; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11238; GFX10-CU:       ; %bb.0: ; %entry
11239; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11240; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11241; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11242; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11243; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11244; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11245; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11246; GFX10-CU-NEXT:    s_endpgm
11247;
11248; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11249; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11250; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11251; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11252; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11253; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11254; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11255; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11256; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11257; SKIP-CACHE-INV-NEXT:    s_endpgm
11258;
11259; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11260; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11261; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11262; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11263; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11264; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11265; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11266; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11267; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11268;
11269; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11270; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11271; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11272; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11273; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11274; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11275; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11276; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11277; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11278; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11279; GFX90A-TGSPLIT-NEXT:    s_endpgm
11280;
11281; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11282; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11283; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11284; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11285; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11286; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11287; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11288; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11289; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11290;
11291; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11292; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11293; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11294; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11295; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11296; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11297; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11298; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11299; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11300; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
11301; GFX940-TGSPLIT-NEXT:    s_endpgm
11302;
11303; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11304; GFX11-WGP:       ; %bb.0: ; %entry
11305; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11306; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11307; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11308; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11309; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11310; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11311; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11312; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11313; GFX11-WGP-NEXT:    buffer_gl0_inv
11314; GFX11-WGP-NEXT:    s_endpgm
11315;
11316; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11317; GFX11-CU:       ; %bb.0: ; %entry
11318; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11319; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11320; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11321; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11322; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11323; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11324; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11325; GFX11-CU-NEXT:    s_endpgm
11326;
11327; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11328; GFX12-WGP:       ; %bb.0: ; %entry
11329; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11330; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11331; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11332; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11333; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11334; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11335; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11336; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11337; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
11338; GFX12-WGP-NEXT:    s_endpgm
11339;
11340; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11341; GFX12-CU:       ; %bb.0: ; %entry
11342; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11343; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11344; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11345; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11346; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11347; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11348; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11349; GFX12-CU-NEXT:    s_endpgm
11350    ptr %out, i32 %in) {
11351entry:
  ; The exchanged-out value %val is never read before the return.
11352  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
11353  ret void
11354}
11355
; Kernel under test: a volatile release atomicrmw xchg of %in into %out at
; syncscope "workgroup-one-as".
; The comment lines inside the signature are the autogenerated per-run
; expectations; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
; As recorded below, the WGP-mode and tgsplit runs expect extra memory-counter
; waits immediately before the swap and nothing after it, while the remaining
; runs expect only the swap.
11356define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
11357; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
11358; GFX7:       ; %bb.0: ; %entry
11359; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11360; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11361; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11362; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11363; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11364; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11365; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11366; GFX7-NEXT:    s_endpgm
11367;
11368; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
11369; GFX10-WGP:       ; %bb.0: ; %entry
11370; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11371; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11372; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11373; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11374; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11375; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11376; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11377; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11378; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11379; GFX10-WGP-NEXT:    s_endpgm
11380;
11381; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
11382; GFX10-CU:       ; %bb.0: ; %entry
11383; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11384; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11385; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11387; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11388; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11389; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11390; GFX10-CU-NEXT:    s_endpgm
11391;
11392; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw:
11393; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11394; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11395; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11396; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11397; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11398; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11399; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11400; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11401; SKIP-CACHE-INV-NEXT:    s_endpgm
11402;
11403; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11404; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11405; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11406; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11407; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11408; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11409; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11410; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11411; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11412;
11413; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11414; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11415; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11416; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11417; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11418; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11419; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11420; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11421; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11422; GFX90A-TGSPLIT-NEXT:    s_endpgm
11423;
11424; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11425; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11426; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11427; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11428; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11429; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11430; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11431; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11432; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11433;
11434; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11435; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11436; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11437; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11438; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11439; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11440; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11441; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11442; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11443; GFX940-TGSPLIT-NEXT:    s_endpgm
11444;
11445; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
11446; GFX11-WGP:       ; %bb.0: ; %entry
11447; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11448; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11449; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11450; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11451; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11452; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11453; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11454; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11455; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11456; GFX11-WGP-NEXT:    s_endpgm
11457;
11458; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
11459; GFX11-CU:       ; %bb.0: ; %entry
11460; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11461; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11462; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11463; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11464; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11465; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11466; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11467; GFX11-CU-NEXT:    s_endpgm
11468;
11469; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
11470; GFX12-WGP:       ; %bb.0: ; %entry
11471; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11472; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11473; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11474; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11475; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11476; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11477; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11478; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11479; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
11480; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11481; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11482; GFX12-WGP-NEXT:    s_endpgm
11483;
11484; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
11485; GFX12-CU:       ; %bb.0: ; %entry
11486; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11487; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11488; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11489; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11490; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11491; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11492; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11493; GFX12-CU-NEXT:    s_endpgm
11494    ptr %out, i32 %in) {
11495entry:
  ; The exchanged-out value %val is never read before the return.
11496  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
11497  ret void
11498}
11499
; Kernel under test: a volatile acq_rel atomicrmw xchg of %in into %out at
; syncscope "workgroup-one-as".
; The comment lines inside the signature are the autogenerated per-run
; expectations; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
; As recorded below, the WGP-mode and tgsplit runs expect extra memory-counter
; waits before the swap plus a wait and a cache-invalidate instruction after
; it, while the remaining runs expect only the swap.
11500define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
11501; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11502; GFX7:       ; %bb.0: ; %entry
11503; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11504; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11505; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11506; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11507; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11508; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11509; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11510; GFX7-NEXT:    s_endpgm
11511;
11512; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11513; GFX10-WGP:       ; %bb.0: ; %entry
11514; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11515; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11516; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11517; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11518; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11519; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11520; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11521; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11522; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11523; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11524; GFX10-WGP-NEXT:    buffer_gl0_inv
11525; GFX10-WGP-NEXT:    s_endpgm
11526;
11527; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11528; GFX10-CU:       ; %bb.0: ; %entry
11529; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11530; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11531; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11532; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11533; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11534; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11535; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11536; GFX10-CU-NEXT:    s_endpgm
11537;
11538; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11539; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11540; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11541; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11542; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11543; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11544; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11545; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11546; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11547; SKIP-CACHE-INV-NEXT:    s_endpgm
11548;
11549; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11550; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11551; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11552; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11553; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11554; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11555; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11556; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11557; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11558;
11559; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11560; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11561; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11562; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11563; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11564; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11565; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11566; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11567; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11568; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11569; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11570; GFX90A-TGSPLIT-NEXT:    s_endpgm
11571;
11572; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11573; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11574; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11575; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11576; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11577; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11578; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11579; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11580; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11581;
11582; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11583; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11584; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11585; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11586; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11587; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11588; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11589; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11590; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11591; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11592; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
11593; GFX940-TGSPLIT-NEXT:    s_endpgm
11594;
11595; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11596; GFX11-WGP:       ; %bb.0: ; %entry
11597; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11598; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11599; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11600; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11601; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11602; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11603; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11604; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11605; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11606; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11607; GFX11-WGP-NEXT:    buffer_gl0_inv
11608; GFX11-WGP-NEXT:    s_endpgm
11609;
11610; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11611; GFX11-CU:       ; %bb.0: ; %entry
11612; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11613; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11614; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11615; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11616; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11617; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11618; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11619; GFX11-CU-NEXT:    s_endpgm
11620;
11621; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11622; GFX12-WGP:       ; %bb.0: ; %entry
11623; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11624; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11625; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11626; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11627; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11628; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11629; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11630; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11631; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
11632; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11633; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11634; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11635; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
11636; GFX12-WGP-NEXT:    s_endpgm
11637;
11638; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11639; GFX12-CU:       ; %bb.0: ; %entry
11640; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11641; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11642; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11643; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11644; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11645; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11646; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11647; GFX12-CU-NEXT:    s_endpgm
11648    ptr %out, i32 %in) {
11649entry:
  ; The exchanged-out value %val is never read before the return.
11650  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
11651  ret void
11652}
11653
; Non-returning seq_cst exchange at the "workgroup-one-as" sync scope through a
; flat pointer. The atomicrmw result has no users, and every expected sequence
; below uses the form of flat_atomic_swap that has no destination register.
; Expected shape per configuration (see the RUN lines at the top of the file)
;  - gfx700, the +cumode runs, the gfx90a and gfx940 runs without +tgsplit, and
;    the amdpal skip-cache-invalidations run expect the bare atomic with no
;    extra waits or cache invalidates around it.
;  - the gfx1010 and gfx1100 runs without +cumode expect vmcnt and vscnt waits
;    before the atomic, then a vscnt wait and buffer_gl0_inv after it.
;  - the +tgsplit runs expect a vmcnt wait on both sides of the atomic followed
;    by buffer_wbinvl1_vol (gfx90a) or buffer_inv sc0 (gfx940).
;  - the gfx1200 run without +cumode expects bvhcnt, samplecnt, loadcnt and
;    storecnt waits before an atomic tagged with SCOPE_SE, then a storecnt wait
;    and a global_inv carrying the same scope.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
11654define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
11655; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11656; GFX7:       ; %bb.0: ; %entry
11657; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11658; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11659; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11660; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11661; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11662; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11663; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11664; GFX7-NEXT:    s_endpgm
11665;
11666; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11667; GFX10-WGP:       ; %bb.0: ; %entry
11668; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11669; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11670; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11671; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11672; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11673; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11674; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11675; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11676; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11677; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11678; GFX10-WGP-NEXT:    buffer_gl0_inv
11679; GFX10-WGP-NEXT:    s_endpgm
11680;
11681; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11682; GFX10-CU:       ; %bb.0: ; %entry
11683; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11684; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11685; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11686; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11687; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11688; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11689; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11690; GFX10-CU-NEXT:    s_endpgm
11691;
11692; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11693; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11694; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11695; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11696; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11697; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11698; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11699; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11700; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11701; SKIP-CACHE-INV-NEXT:    s_endpgm
11702;
11703; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11704; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11705; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11706; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11707; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11708; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11709; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11710; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11711; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11712;
11713; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11714; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11715; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11716; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11717; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11718; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11719; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11720; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11721; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11722; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11723; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11724; GFX90A-TGSPLIT-NEXT:    s_endpgm
11725;
11726; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11727; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11728; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11729; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11730; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11731; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11732; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11733; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11734; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11735;
11736; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11737; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11738; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11739; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11740; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11741; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11742; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11743; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11744; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11745; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11746; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
11747; GFX940-TGSPLIT-NEXT:    s_endpgm
11748;
11749; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11750; GFX11-WGP:       ; %bb.0: ; %entry
11751; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11752; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11753; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11754; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11755; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11756; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11757; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11758; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11759; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11760; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11761; GFX11-WGP-NEXT:    buffer_gl0_inv
11762; GFX11-WGP-NEXT:    s_endpgm
11763;
11764; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11765; GFX11-CU:       ; %bb.0: ; %entry
11766; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11767; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11768; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11769; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11770; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11771; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11772; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11773; GFX11-CU-NEXT:    s_endpgm
11774;
11775; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11776; GFX12-WGP:       ; %bb.0: ; %entry
11777; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11778; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11779; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11780; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11781; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11782; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11783; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11784; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11785; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
11786; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11787; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11788; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11789; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
11790; GFX12-WGP-NEXT:    s_endpgm
11791;
11792; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11793; GFX12-CU:       ; %bb.0: ; %entry
11794; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11795; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11796; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11797; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11798; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11799; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11800; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11801; GFX12-CU-NEXT:    s_endpgm
11802    ptr %out, i32 %in) {
11803entry:
  ; Volatile seq_cst exchange of %in into the location addressed by %out.
  ; %val has no users in this function.
11804  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
11805  ret void
11806}
11807
; Returning acquire exchange at the "workgroup-one-as" sync scope through a
; flat pointer. The value produced by the atomicrmw is stored back through
; %out, and every expected sequence below uses the returning form of the swap
; (a destination register plus glc, sc0 or TH_ATOMIC_RETURN).
; Expected shape per configuration (see the RUN lines at the top of the file)
;  - no configuration expects a wait between the operand moves and the atomic.
;  - gfx700, the +cumode runs, the gfx90a and gfx940 runs without +tgsplit, and
;    the amdpal skip-cache-invalidations run expect only one combined wait
;    right before the store (vmcnt plus lgkmcnt, or loadcnt_dscnt on gfx1200).
;  - the gfx1010 and gfx1100 runs without +cumode expect a vmcnt wait and
;    buffer_gl0_inv directly after the atomic, then an lgkmcnt wait before the
;    store.
;  - the +tgsplit runs expect a vmcnt wait directly after the atomic followed
;    by buffer_wbinvl1_vol (gfx90a) or buffer_inv sc0 (gfx940), with no further
;    wait before the store.
;  - the gfx1200 run without +cumode expects a loadcnt wait and a global_inv
;    with SCOPE_SE after the atomic, then a dscnt wait before the store.
; In every sequence the address registers are moved from the SGPRs again
; before the store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
11808define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
11809; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11810; GFX7:       ; %bb.0: ; %entry
11811; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11812; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
11813; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11814; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11815; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11816; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11817; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11818; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11819; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11820; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11821; GFX7-NEXT:    flat_store_dword v[0:1], v2
11822; GFX7-NEXT:    s_endpgm
11823;
11824; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11825; GFX10-WGP:       ; %bb.0: ; %entry
11826; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11827; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11828; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11829; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11830; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11831; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
11832; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11833; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11834; GFX10-WGP-NEXT:    buffer_gl0_inv
11835; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11836; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11837; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11838; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11839; GFX10-WGP-NEXT:    s_endpgm
11840;
11841; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11842; GFX10-CU:       ; %bb.0: ; %entry
11843; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11844; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11845; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11846; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11847; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11848; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
11849; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11850; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11851; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11852; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11853; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11854; GFX10-CU-NEXT:    s_endpgm
11855;
11856; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11857; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11858; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11859; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
11860; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11862; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11863; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
11864; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11865; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11866; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11867; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11868; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11869; SKIP-CACHE-INV-NEXT:    s_endpgm
11870;
11871; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11872; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11873; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11874; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11875; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11876; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11877; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
11878; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11879; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11880; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11881; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11882; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11883;
11884; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11885; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11886; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11887; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11888; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11889; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11890; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
11891; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11892; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11893; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11894; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11895; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11896; GFX90A-TGSPLIT-NEXT:    s_endpgm
11897;
11898; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11899; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11900; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11901; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11902; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11903; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11904; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
11905; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
11906; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11907; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11908; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11909; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11910;
11911; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11912; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11913; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11914; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11915; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11916; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11917; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
11918; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
11919; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11920; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
11921; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11922; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11923; GFX940-TGSPLIT-NEXT:    s_endpgm
11924;
11925; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11926; GFX11-WGP:       ; %bb.0: ; %entry
11927; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11928; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11929; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11930; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11931; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11932; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
11933; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
11934; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11935; GFX11-WGP-NEXT:    buffer_gl0_inv
11936; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11937; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11938; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11939; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11940; GFX11-WGP-NEXT:    s_endpgm
11941;
11942; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11943; GFX11-CU:       ; %bb.0: ; %entry
11944; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11945; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11946; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11947; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11948; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11949; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
11950; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
11951; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11952; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11953; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11954; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11955; GFX11-CU-NEXT:    s_endpgm
11956;
11957; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11958; GFX12-WGP:       ; %bb.0: ; %entry
11959; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11960; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11961; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11962; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11963; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11964; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
11965; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
11966; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
11967; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
11968; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11969; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11970; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
11971; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
11972; GFX12-WGP-NEXT:    s_endpgm
11973;
11974; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11975; GFX12-CU:       ; %bb.0: ; %entry
11976; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11977; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11978; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11979; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11980; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11981; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
11982; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
11983; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11984; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11985; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11986; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11987; GFX12-CU-NEXT:    s_endpgm
11988    ptr %out, i32 %in) {
11989entry:
  ; Volatile acquire exchange of %in into the location addressed by %out.
11990  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
  ; The plain store is the only user of %val and writes it back through %out.
11991  store i32 %val, ptr %out, align 4
11992  ret void
11993}
11994
; Returning acq_rel exchange at the "workgroup-one-as" sync scope through a
; flat pointer. The value produced by the atomicrmw is stored back through
; %out, and every expected sequence below uses the returning form of the swap
; (a destination register plus glc, sc0 or TH_ATOMIC_RETURN).
; Expected shape per configuration (see the RUN lines at the top of the file)
;  - gfx700, the +cumode runs, the gfx90a and gfx940 runs without +tgsplit, and
;    the amdpal skip-cache-invalidations run expect the bare returning atomic
;    and one combined wait right before the store (vmcnt plus lgkmcnt, or
;    loadcnt_dscnt on gfx1200).
;  - the gfx1010 and gfx1100 runs without +cumode expect vmcnt and vscnt waits
;    before the atomic, a vmcnt wait and buffer_gl0_inv after it, then an
;    lgkmcnt wait before the store.
;  - the +tgsplit runs expect a vmcnt wait on both sides of the atomic followed
;    by buffer_wbinvl1_vol (gfx90a) or buffer_inv sc0 (gfx940), with no further
;    wait before the store.
;  - the gfx1200 run without +cumode expects bvhcnt, samplecnt, loadcnt and
;    storecnt waits before an atomic tagged with SCOPE_SE, then bvhcnt,
;    samplecnt and loadcnt waits and a global_inv with the same scope, then a
;    dscnt wait before the store.
; In every sequence the address registers are moved from the SGPRs again
; before the store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
11995define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
11996; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
11997; GFX7:       ; %bb.0: ; %entry
11998; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11999; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
12000; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12001; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12002; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12003; GFX7-NEXT:    v_mov_b32_e32 v2, s6
12004; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12005; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12006; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12007; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12008; GFX7-NEXT:    flat_store_dword v[0:1], v2
12009; GFX7-NEXT:    s_endpgm
12010;
12011; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12012; GFX10-WGP:       ; %bb.0: ; %entry
12013; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12014; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
12015; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12016; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12017; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12018; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
12019; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12020; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12021; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12022; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12023; GFX10-WGP-NEXT:    buffer_gl0_inv
12024; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12025; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12026; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12027; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12028; GFX10-WGP-NEXT:    s_endpgm
12029;
12030; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12031; GFX10-CU:       ; %bb.0: ; %entry
12032; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12033; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
12034; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12035; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12036; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12037; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
12038; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12039; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12040; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12041; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12042; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12043; GFX10-CU-NEXT:    s_endpgm
12044;
12045; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12046; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12047; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12048; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
12049; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12050; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12051; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
12053; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12054; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12055; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12056; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12057; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12058; SKIP-CACHE-INV-NEXT:    s_endpgm
12059;
12060; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12061; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12062; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12063; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12064; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12065; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12066; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
12067; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12068; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12069; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12070; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12071; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12072;
12073; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12074; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12075; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12076; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12077; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12078; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12079; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
12080; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12081; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12082; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12083; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
12084; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12085; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12086; GFX90A-TGSPLIT-NEXT:    s_endpgm
12087;
12088; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12089; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12090; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12091; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12092; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12093; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12094; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
12095; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
12096; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12097; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12098; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12099; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12100;
12101; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12102; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12103; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12104; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12105; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12106; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12107; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
12108; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12109; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
12110; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12111; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
12112; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12113; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12114; GFX940-TGSPLIT-NEXT:    s_endpgm
12115;
12116; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12117; GFX11-WGP:       ; %bb.0: ; %entry
12118; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12119; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12120; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12121; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12122; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12123; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
12124; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12125; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12126; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
12127; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12128; GFX11-WGP-NEXT:    buffer_gl0_inv
12129; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12130; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12131; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12132; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12133; GFX11-WGP-NEXT:    s_endpgm
12134;
12135; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12136; GFX11-CU:       ; %bb.0: ; %entry
12137; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12138; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12139; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12140; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12141; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12142; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
12143; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
12144; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12145; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12146; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12147; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12148; GFX11-CU-NEXT:    s_endpgm
12149;
12150; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12151; GFX12-WGP:       ; %bb.0: ; %entry
12152; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12153; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12154; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12155; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12156; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12157; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
12158; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12159; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12160; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12161; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
12162; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
12163; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12164; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12165; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12166; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
12167; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12168; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12169; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
12170; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12171; GFX12-WGP-NEXT:    s_endpgm
12172;
12173; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12174; GFX12-CU:       ; %bb.0: ; %entry
12175; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12176; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12177; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12178; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12179; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12180; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
12181; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12182; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12183; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12184; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
12185; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12186; GFX12-CU-NEXT:    s_endpgm
12187    ptr %out, i32 %in) {
12188entry:
  ; Volatile acq_rel exchange of %in into the location addressed by %out.
12189  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
  ; The plain store is the only user of %val and writes it back through %out.
12190  store i32 %val, ptr %out, align 4
12191  ret void
12192}
12193
12194define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
12195; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12196; GFX7:       ; %bb.0: ; %entry
12197; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12198; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
12199; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12200; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12201; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12202; GFX7-NEXT:    v_mov_b32_e32 v2, s6
12203; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12204; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12205; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12206; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12207; GFX7-NEXT:    flat_store_dword v[0:1], v2
12208; GFX7-NEXT:    s_endpgm
12209;
12210; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12211; GFX10-WGP:       ; %bb.0: ; %entry
12212; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12213; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
12214; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12215; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12216; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12217; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
12218; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12219; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12220; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12221; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12222; GFX10-WGP-NEXT:    buffer_gl0_inv
12223; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12224; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12225; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12226; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12227; GFX10-WGP-NEXT:    s_endpgm
12228;
12229; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12230; GFX10-CU:       ; %bb.0: ; %entry
12231; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12232; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
12233; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12234; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12235; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12236; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
12237; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12238; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12239; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12240; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12241; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12242; GFX10-CU-NEXT:    s_endpgm
12243;
12244; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12245; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12246; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12247; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
12248; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
12252; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12253; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12254; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12255; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12256; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12257; SKIP-CACHE-INV-NEXT:    s_endpgm
12258;
12259; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12260; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12261; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12262; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12263; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12264; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12265; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
12266; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12267; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12268; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12269; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12270; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12271;
12272; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12273; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12274; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12275; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12276; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12277; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12278; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
12279; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12280; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12281; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12282; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
12283; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12284; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12285; GFX90A-TGSPLIT-NEXT:    s_endpgm
12286;
12287; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12288; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12289; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12290; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12291; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12292; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12293; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
12294; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
12295; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12296; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12297; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12298; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12299;
12300; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12301; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12302; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12303; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12304; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12305; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12306; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
12307; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12308; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
12309; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12310; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
12311; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12312; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12313; GFX940-TGSPLIT-NEXT:    s_endpgm
12314;
12315; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12316; GFX11-WGP:       ; %bb.0: ; %entry
12317; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12318; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12319; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12320; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12321; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12322; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
12323; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12324; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12325; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
12326; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12327; GFX11-WGP-NEXT:    buffer_gl0_inv
12328; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12329; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12330; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12331; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12332; GFX11-WGP-NEXT:    s_endpgm
12333;
12334; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12335; GFX11-CU:       ; %bb.0: ; %entry
12336; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12337; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12338; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12339; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12340; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12341; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
12342; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
12343; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12344; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12345; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12346; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12347; GFX11-CU-NEXT:    s_endpgm
12348;
12349; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12350; GFX12-WGP:       ; %bb.0: ; %entry
12351; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12352; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12353; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12354; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12355; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12356; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
12357; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12358; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12359; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12360; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
12361; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
12362; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12363; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12364; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12365; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
12366; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12367; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12368; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
12369; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12370; GFX12-WGP-NEXT:    s_endpgm
12371;
12372; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12373; GFX12-CU:       ; %bb.0: ; %entry
12374; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12375; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12376; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12377; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12378; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12379; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
12380; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12381; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12382; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12383; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
12384; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12385; GFX12-CU-NEXT:    s_endpgm
12386    ptr %out, i32 %in) {
12387entry:
12388  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
12389  store i32 %val, ptr %out, align 4
12390  ret void
12391}
12392
; Test: cmpxchg on a flat pointer with syncscope("workgroup-one-as") and
; monotonic success / monotonic failure ordering. The result is unused.
; The address is %out advanced by four i32 elements, which appears as a
; byte offset of 16 in the expected code (either as offset:16 on the
; atomic or as an explicit 64-bit scalar add of 16).
; In every RUN configuration the expected flat_atomic_cmpswap is followed
; directly by s_endpgm, with no wait or cache-invalidate instructions next
; to it; the gfx1200 run without +cumode additionally expects
; scope:SCOPE_SE on the atomic.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
12393define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
12394; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12395; GFX7:       ; %bb.0: ; %entry
12396; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12397; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12398; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12399; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12400; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12401; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12402; GFX7-NEXT:    s_mov_b32 s4, s8
12403; GFX7-NEXT:    s_mov_b32 s5, s9
12404; GFX7-NEXT:    s_mov_b32 s9, s10
12405; GFX7-NEXT:    s_mov_b32 s8, s11
12406; GFX7-NEXT:    s_add_u32 s4, s4, s9
12407; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12408; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12409; GFX7-NEXT:    s_mov_b32 s5, s8
12410; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12411; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12412; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12413; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12414; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12415; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12416; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12417; GFX7-NEXT:    s_endpgm
12418;
12419; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12420; GFX10-WGP:       ; %bb.0: ; %entry
12421; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12422; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12423; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12424; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12425; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12426; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12427; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12428; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12429; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12430; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12431; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12432; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12433; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12434; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12435; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12436; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12437; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12438; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12439; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12440; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12441; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12442; GFX10-WGP-NEXT:    s_endpgm
12443;
12444; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12445; GFX10-CU:       ; %bb.0: ; %entry
12446; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12447; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12448; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12449; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12450; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12451; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12452; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12453; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12454; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12455; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12456; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12457; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12458; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12459; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12460; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12461; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12462; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12463; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12464; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12465; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12466; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12467; GFX10-CU-NEXT:    s_endpgm
12468;
12469; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12470; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12471; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12472; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12473; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12474; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12475; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12476; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12477; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12478; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12479; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12480; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12481; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12482; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12483; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12484; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12487; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12488; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12491; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12492; SKIP-CACHE-INV-NEXT:    s_endpgm
12493;
12494; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12495; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12496; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12497; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12498; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12500; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12501; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12502; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12503; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12504; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12505; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12506; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12507;
12508; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12509; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12510; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12511; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12512; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12513; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12514; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12515; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12516; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12517; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12518; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12519; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12520; GFX90A-TGSPLIT-NEXT:    s_endpgm
12521;
12522; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12523; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12524; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12525; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12526; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12527; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12528; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12529; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12530; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12531; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12532; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12533; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12534; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12535;
12536; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12537; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12538; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12539; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12540; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12541; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12542; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12543; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12544; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12545; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12546; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12547; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12548; GFX940-TGSPLIT-NEXT:    s_endpgm
12549;
12550; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12551; GFX11-WGP:       ; %bb.0: ; %entry
12552; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12553; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12554; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12555; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12556; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
12557; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12558; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12559; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
12560; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12561; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12562; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12563; GFX11-WGP-NEXT:    s_endpgm
12564;
12565; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12566; GFX11-CU:       ; %bb.0: ; %entry
12567; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12568; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12569; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12570; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12571; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
12572; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12573; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12574; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
12575; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12576; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12577; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12578; GFX11-CU-NEXT:    s_endpgm
12579;
12580; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12581; GFX12-WGP:       ; %bb.0: ; %entry
12582; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12583; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12584; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12585; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12586; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
12587; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12588; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12589; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
12590; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12591; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12592; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
12593; GFX12-WGP-NEXT:    s_endpgm
12594;
12595; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12596; GFX12-CU:       ; %bb.0: ; %entry
12597; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12598; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12599; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12600; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12601; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
12602; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12603; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12604; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
12605; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12606; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12607; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12608; GFX12-CU-NEXT:    s_endpgm
12609    ptr %out, i32 %in, i32 %old) {
12610entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
12611  %gep = getelementptr i32, ptr %out, i32 4
12612  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
12613  ret void
12614}
12615
; Test: cmpxchg on a flat pointer with syncscope("workgroup-one-as") and
; acquire success / monotonic failure ordering. The result is unused.
; The address is %out advanced by four i32 elements (byte offset 16).
; Expected code after the flat_atomic_cmpswap, per RUN configuration:
;   - gfx1010 and gfx1100 without +cumode: s_waitcnt_vscnt null, 0x0
;     followed by buffer_gl0_inv.
;   - gfx1200 without +cumode: s_wait_storecnt 0x0 followed by
;     global_inv scope:SCOPE_SE (the atomic itself also carries
;     scope:SCOPE_SE).
;   - gfx90a with +tgsplit: s_waitcnt vmcnt(0) followed by
;     buffer_wbinvl1_vol.
;   - gfx940 with +tgsplit: s_waitcnt vmcnt(0) followed by buffer_inv sc0.
;   - all remaining runs: nothing; s_endpgm follows the atomic directly.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
12616define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
12617; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12618; GFX7:       ; %bb.0: ; %entry
12619; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12620; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12621; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12622; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12623; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12624; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12625; GFX7-NEXT:    s_mov_b32 s4, s8
12626; GFX7-NEXT:    s_mov_b32 s5, s9
12627; GFX7-NEXT:    s_mov_b32 s9, s10
12628; GFX7-NEXT:    s_mov_b32 s8, s11
12629; GFX7-NEXT:    s_add_u32 s4, s4, s9
12630; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12631; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12632; GFX7-NEXT:    s_mov_b32 s5, s8
12633; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12634; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12635; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12636; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12637; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12638; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12639; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12640; GFX7-NEXT:    s_endpgm
12641;
12642; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12643; GFX10-WGP:       ; %bb.0: ; %entry
12644; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12645; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12646; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12647; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12648; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12649; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12650; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12651; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12652; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12653; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12654; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12655; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12656; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12657; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12658; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12659; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12660; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12661; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12662; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12663; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12664; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12665; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12666; GFX10-WGP-NEXT:    buffer_gl0_inv
12667; GFX10-WGP-NEXT:    s_endpgm
12668;
12669; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12670; GFX10-CU:       ; %bb.0: ; %entry
12671; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12672; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12673; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12674; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12675; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12676; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12677; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12678; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12679; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12680; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12681; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12682; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12683; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12684; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12685; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12686; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12687; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12688; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12689; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12690; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12691; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12692; GFX10-CU-NEXT:    s_endpgm
12693;
12694; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12695; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12696; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12697; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12698; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12699; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12700; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12701; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12702; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12703; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12704; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12705; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12706; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12707; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12708; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12709; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12710; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12711; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12712; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12713; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12714; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12715; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12716; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12717; SKIP-CACHE-INV-NEXT:    s_endpgm
12718;
12719; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12720; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12721; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12722; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12723; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12724; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12725; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12726; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12727; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12728; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12729; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12730; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12731; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12732;
12733; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12734; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12735; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12736; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12737; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12738; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12739; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12740; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12741; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12742; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12743; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12744; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12745; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12746; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
12747; GFX90A-TGSPLIT-NEXT:    s_endpgm
12748;
12749; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12750; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12751; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12752; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12753; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12754; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12755; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12756; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12757; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12758; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12759; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12760; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12761; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12762;
12763; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12764; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12765; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12766; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12767; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12768; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12769; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12770; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12771; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12772; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12773; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12774; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12775; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12776; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
12777; GFX940-TGSPLIT-NEXT:    s_endpgm
12778;
12779; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12780; GFX11-WGP:       ; %bb.0: ; %entry
12781; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12782; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12783; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12784; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12785; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
12786; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12787; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12788; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
12789; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12790; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12791; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12792; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12793; GFX11-WGP-NEXT:    buffer_gl0_inv
12794; GFX11-WGP-NEXT:    s_endpgm
12795;
12796; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12797; GFX11-CU:       ; %bb.0: ; %entry
12798; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12799; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12800; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12801; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12802; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
12803; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12804; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12805; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
12806; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12807; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12808; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12809; GFX11-CU-NEXT:    s_endpgm
12810;
12811; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12812; GFX12-WGP:       ; %bb.0: ; %entry
12813; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12814; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12815; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12816; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12817; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
12818; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12819; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12820; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
12821; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12822; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12823; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
12824; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
12825; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
12826; GFX12-WGP-NEXT:    s_endpgm
12827;
12828; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12829; GFX12-CU:       ; %bb.0: ; %entry
12830; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12831; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12832; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12833; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12834; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
12835; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12836; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12837; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
12838; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12839; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12840; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12841; GFX12-CU-NEXT:    s_endpgm
12842    ptr %out, i32 %in, i32 %old) {
12843entry:
  ; Index 4 of an i32 array: the atomic targets %out + 16 bytes.
12844  %gep = getelementptr i32, ptr %out, i32 4
12845  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
12846  ret void
12847}
12848
12849define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
12850; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12851; GFX7:       ; %bb.0: ; %entry
12852; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12853; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12854; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12855; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12856; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12857; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12858; GFX7-NEXT:    s_mov_b32 s4, s8
12859; GFX7-NEXT:    s_mov_b32 s5, s9
12860; GFX7-NEXT:    s_mov_b32 s9, s10
12861; GFX7-NEXT:    s_mov_b32 s8, s11
12862; GFX7-NEXT:    s_add_u32 s4, s4, s9
12863; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12864; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12865; GFX7-NEXT:    s_mov_b32 s5, s8
12866; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12867; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12868; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12869; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12870; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12871; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12872; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12873; GFX7-NEXT:    s_endpgm
12874;
12875; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12876; GFX10-WGP:       ; %bb.0: ; %entry
12877; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12878; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12879; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12880; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12881; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12882; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12883; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12884; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12885; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12886; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12887; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12888; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12889; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12890; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12891; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12892; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12893; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12894; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12895; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12896; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12897; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12898; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12899; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12900; GFX10-WGP-NEXT:    s_endpgm
12901;
12902; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12903; GFX10-CU:       ; %bb.0: ; %entry
12904; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12905; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12906; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12907; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12908; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12909; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12910; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12911; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12912; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12913; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12914; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12915; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12916; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12917; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12918; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12919; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12920; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12921; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12922; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12923; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12924; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12925; GFX10-CU-NEXT:    s_endpgm
12926;
12927; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12928; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12929; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12930; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12931; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12932; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12933; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12934; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12935; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12936; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12937; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12938; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12939; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12940; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12941; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12942; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12943; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12944; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12945; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12946; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12947; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12948; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12949; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12950; SKIP-CACHE-INV-NEXT:    s_endpgm
12951;
12952; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12953; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12954; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12955; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12956; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12957; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12958; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12959; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12960; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12961; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12962; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12963; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12964; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12965;
12966; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12967; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12968; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12969; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12970; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12971; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12972; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12973; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12974; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12975; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12976; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12977; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12978; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12979; GFX90A-TGSPLIT-NEXT:    s_endpgm
12980;
12981; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12982; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12983; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12984; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12985; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12986; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12987; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12988; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12989; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12990; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12991; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12992; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12993; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12994;
12995; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12996; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12997; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12998; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12999; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13000; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13001; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13002; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13003; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13004; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13005; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13006; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13007; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13008; GFX940-TGSPLIT-NEXT:    s_endpgm
13009;
13010; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13011; GFX11-WGP:       ; %bb.0: ; %entry
13012; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13013; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13014; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13015; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13016; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13017; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13018; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13019; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13020; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13021; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13022; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13023; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13024; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13025; GFX11-WGP-NEXT:    s_endpgm
13026;
13027; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13028; GFX11-CU:       ; %bb.0: ; %entry
13029; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13030; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13031; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13032; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13033; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13034; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13035; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13036; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13037; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13038; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13039; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13040; GFX11-CU-NEXT:    s_endpgm
13041;
13042; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13043; GFX12-WGP:       ; %bb.0: ; %entry
13044; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13045; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13046; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13047; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13048; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13049; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13050; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13051; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13052; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13053; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13054; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
13055; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
13056; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13057; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13058; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13059; GFX12-WGP-NEXT:    s_endpgm
13060;
13061; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13062; GFX12-CU:       ; %bb.0: ; %entry
13063; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13064; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13065; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13066; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13067; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13068; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13069; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13070; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13071; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13072; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13073; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13074; GFX12-CU-NEXT:    s_endpgm
13075    ptr %out, i32 %in, i32 %old) {
13076entry:
13077  %gep = getelementptr i32, ptr %out, i32 4
13078  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
13079  ret void
13080}
13081
; Kernel under test. It performs a volatile cmpxchg on the pointer %out plus
; four i32 elements (the 16-byte offset that shows up in the expected output),
; comparing against %old and storing %in, with syncscope("workgroup-one-as"),
; success ordering acq_rel and failure ordering monotonic. The cmpxchg result
; is not used afterwards.
;
; The assertion lines inside this definition are autogenerated (see the NOTE on
; the first line of the file). Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Summary of what the current assertions expect around the atomic instruction
;   - For the GFX10-WGP and GFX11-WGP prefixes it is preceded by
;     s_waitcnt vmcnt(0) and s_waitcnt_vscnt, and followed by s_waitcnt_vscnt
;     and buffer_gl0_inv.
;   - For the GFX12-WGP prefix it is preceded by s_wait_bvhcnt,
;     s_wait_samplecnt, s_wait_loadcnt and s_wait_storecnt, carries SCOPE_SE,
;     and is followed by s_wait_storecnt and a global_inv with SCOPE_SE.
;   - For the GFX90A-TGSPLIT and GFX940-TGSPLIT prefixes it has
;     s_waitcnt vmcnt(0) on both sides, followed by buffer_wbinvl1_vol and
;     buffer_inv sc0 respectively.
;   - For every other prefix the atomic is followed directly by s_endpgm and
;     has no wait or cache-invalidate instruction adjacent to it.
;   - The GFX7, GFX10 and SKIP-CACHE-INV outputs add the offset to the address
;     with s_add_u32 / s_addc_u32, while the remaining outputs use the offset
;     field of the atomic instruction.
13082define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
13083; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13084; GFX7:       ; %bb.0: ; %entry
13085; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13086; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13087; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13088; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13089; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13090; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13091; GFX7-NEXT:    s_mov_b32 s4, s8
13092; GFX7-NEXT:    s_mov_b32 s5, s9
13093; GFX7-NEXT:    s_mov_b32 s9, s10
13094; GFX7-NEXT:    s_mov_b32 s8, s11
13095; GFX7-NEXT:    s_add_u32 s4, s4, s9
13096; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13097; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13098; GFX7-NEXT:    s_mov_b32 s5, s8
13099; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13100; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13101; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13102; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13103; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13104; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13105; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13106; GFX7-NEXT:    s_endpgm
13107;
13108; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13109; GFX10-WGP:       ; %bb.0: ; %entry
13110; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13111; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13112; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13113; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13114; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13115; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13116; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13117; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13118; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13119; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13120; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13121; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13122; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13123; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13124; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13125; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13126; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13127; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13128; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13129; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13130; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13131; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13132; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13133; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13134; GFX10-WGP-NEXT:    buffer_gl0_inv
13135; GFX10-WGP-NEXT:    s_endpgm
13136;
13137; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13138; GFX10-CU:       ; %bb.0: ; %entry
13139; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13140; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13141; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13142; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13143; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13144; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13145; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13146; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13147; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13148; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13149; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13150; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13151; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13152; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13153; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13154; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13155; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13156; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13157; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13158; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13159; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13160; GFX10-CU-NEXT:    s_endpgm
13161;
13162; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13163; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13164; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13165; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13166; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13167; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13168; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13169; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13170; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13171; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13172; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13173; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13174; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13175; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13176; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13177; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13178; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13179; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13180; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13182; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13184; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13185; SKIP-CACHE-INV-NEXT:    s_endpgm
13186;
13187; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13188; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13189; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13190; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13191; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13192; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13193; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13194; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13195; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13196; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13197; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13198; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13199; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13200;
13201; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13202; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13203; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13204; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13205; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13206; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13207; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13208; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13209; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13210; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13211; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13212; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13213; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13214; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13215; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13216; GFX90A-TGSPLIT-NEXT:    s_endpgm
13217;
13218; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13219; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13220; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13221; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13222; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13223; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13224; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13225; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13226; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13227; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13228; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13229; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13230; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13231;
13232; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13233; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13234; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13235; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13236; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13237; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13238; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13239; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13240; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13241; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13242; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13243; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13244; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13245; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13246; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
13247; GFX940-TGSPLIT-NEXT:    s_endpgm
13248;
13249; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13250; GFX11-WGP:       ; %bb.0: ; %entry
13251; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13252; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13253; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13254; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13255; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13256; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13257; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13258; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13259; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13260; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13261; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13262; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13263; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13264; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13265; GFX11-WGP-NEXT:    buffer_gl0_inv
13266; GFX11-WGP-NEXT:    s_endpgm
13267;
13268; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13269; GFX11-CU:       ; %bb.0: ; %entry
13270; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13271; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13272; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13273; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13274; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13275; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13276; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13277; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13278; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13279; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13280; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13281; GFX11-CU-NEXT:    s_endpgm
13282;
13283; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13284; GFX12-WGP:       ; %bb.0: ; %entry
13285; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13286; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13287; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13288; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13289; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13290; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13291; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13292; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13293; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13294; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13295; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
13296; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
13297; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13298; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13299; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13300; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13301; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
13302; GFX12-WGP-NEXT:    s_endpgm
13303;
13304; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13305; GFX12-CU:       ; %bb.0: ; %entry
13306; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13307; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13308; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13309; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13310; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13311; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13312; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13313; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13314; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13315; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13316; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13317; GFX12-CU-NEXT:    s_endpgm
13318    ptr %out, i32 %in, i32 %old) {
13319entry:
  ; Element index 4 of i32, which the expected output shows as a 16-byte offset.
13320  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare operand and %in the new value. %val is not read again.
13321  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
13322  ret void
13323}
13324
; Kernel under test. It performs a volatile cmpxchg on the pointer %out plus
; four i32 elements (the 16-byte offset that shows up in the expected output),
; comparing against %old and storing %in, with syncscope("workgroup-one-as"),
; success ordering seq_cst and failure ordering monotonic. The cmpxchg result
; is not used afterwards.
;
; The assertion lines inside this definition are autogenerated (see the NOTE on
; the first line of the file). Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Summary of what the current assertions expect around the atomic instruction
;   - For the GFX10-WGP and GFX11-WGP prefixes it is preceded by
;     s_waitcnt vmcnt(0) and s_waitcnt_vscnt, and followed by s_waitcnt_vscnt
;     and buffer_gl0_inv.
;   - For the GFX12-WGP prefix it is preceded by s_wait_bvhcnt,
;     s_wait_samplecnt, s_wait_loadcnt and s_wait_storecnt, carries SCOPE_SE,
;     and is followed by s_wait_storecnt and a global_inv with SCOPE_SE.
;   - For the GFX90A-TGSPLIT and GFX940-TGSPLIT prefixes it has
;     s_waitcnt vmcnt(0) on both sides, followed by buffer_wbinvl1_vol and
;     buffer_inv sc0 respectively.
;   - For every other prefix the atomic is followed directly by s_endpgm and
;     has no wait or cache-invalidate instruction adjacent to it.
;   - The GFX7, GFX10 and SKIP-CACHE-INV outputs add the offset to the address
;     with s_add_u32 / s_addc_u32, while the remaining outputs use the offset
;     field of the atomic instruction.
13325define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
13326; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13327; GFX7:       ; %bb.0: ; %entry
13328; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13329; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13330; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13331; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13332; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13333; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13334; GFX7-NEXT:    s_mov_b32 s4, s8
13335; GFX7-NEXT:    s_mov_b32 s5, s9
13336; GFX7-NEXT:    s_mov_b32 s9, s10
13337; GFX7-NEXT:    s_mov_b32 s8, s11
13338; GFX7-NEXT:    s_add_u32 s4, s4, s9
13339; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13340; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13341; GFX7-NEXT:    s_mov_b32 s5, s8
13342; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13343; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13344; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13345; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13346; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13347; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13348; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13349; GFX7-NEXT:    s_endpgm
13350;
13351; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13352; GFX10-WGP:       ; %bb.0: ; %entry
13353; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13354; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13355; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13356; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13357; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13358; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13359; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13360; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13361; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13362; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13363; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13364; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13365; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13366; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13367; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13368; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13369; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13370; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13371; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13372; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13373; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13374; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13375; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13376; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13377; GFX10-WGP-NEXT:    buffer_gl0_inv
13378; GFX10-WGP-NEXT:    s_endpgm
13379;
13380; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13381; GFX10-CU:       ; %bb.0: ; %entry
13382; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13383; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13384; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13385; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13386; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13387; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13388; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13389; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13390; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13391; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13392; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13393; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13394; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13395; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13396; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13397; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13398; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13399; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13400; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13401; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13402; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13403; GFX10-CU-NEXT:    s_endpgm
13404;
13405; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13406; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13407; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13408; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13409; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13410; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13411; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13412; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13413; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13414; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13415; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13416; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13417; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13418; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13419; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13420; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13421; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13422; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13423; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13424; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13425; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13426; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13427; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13428; SKIP-CACHE-INV-NEXT:    s_endpgm
13429;
13430; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13431; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13432; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13433; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13434; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13435; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13436; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13437; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13438; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13439; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13440; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13441; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13442; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13443;
13444; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13445; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13446; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13447; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13448; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13449; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13450; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13451; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13452; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13453; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13454; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13455; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13456; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13457; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13458; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13459; GFX90A-TGSPLIT-NEXT:    s_endpgm
13460;
13461; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13462; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13463; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13464; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13465; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13466; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13467; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13468; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13469; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13470; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13471; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13472; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13473; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13474;
13475; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13476; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13477; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13478; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13479; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13480; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13481; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13482; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13483; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13484; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13485; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13486; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13487; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13488; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13489; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
13490; GFX940-TGSPLIT-NEXT:    s_endpgm
13491;
13492; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13493; GFX11-WGP:       ; %bb.0: ; %entry
13494; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13495; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13496; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13497; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13498; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13499; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13500; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13501; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13502; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13503; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13504; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13505; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13506; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13507; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13508; GFX11-WGP-NEXT:    buffer_gl0_inv
13509; GFX11-WGP-NEXT:    s_endpgm
13510;
13511; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13512; GFX11-CU:       ; %bb.0: ; %entry
13513; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13514; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13515; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13516; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13517; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13518; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13519; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13520; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13521; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13522; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13523; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13524; GFX11-CU-NEXT:    s_endpgm
13525;
13526; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13527; GFX12-WGP:       ; %bb.0: ; %entry
13528; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13529; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13530; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13531; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13532; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13533; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13534; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13535; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13536; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13537; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13538; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
13539; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
13540; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13541; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13542; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13543; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13544; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
13545; GFX12-WGP-NEXT:    s_endpgm
13546;
13547; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13548; GFX12-CU:       ; %bb.0: ; %entry
13549; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13550; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13551; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13552; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13553; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13554; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13555; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13556; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13557; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13558; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13559; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13560; GFX12-CU-NEXT:    s_endpgm
13561    ptr %out, i32 %in, i32 %old) {
13562entry:
  ; Element index 4 of i32, which the expected output shows as a 16-byte offset.
13563  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare operand and %in the new value. %val is not read again.
13564  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
13565  ret void
13566}
13567
; Test: flat-address-space cmpxchg at syncscope("workgroup-one-as") with
; success ordering monotonic and failure ordering acquire.
;
; The kernel takes a pointer and two i32 values, forms the address of i32
; element 4 of %out (byte offset 16, which shows up in the expected output as
; either "offset:16" on the atomic or an explicit 64-bit add of 16) and issues
; a volatile cmpxchg whose result is never used.
;
; What the assertions below pin down around the flat_atomic_cmpswap:
;   * For every prefix the atomic is directly preceded by the move that sets
;     up its address operand; no wait is expected between operand setup and
;     the atomic.
;   * GFX7, GFX10-CU, SKIP-CACHE-INV, GFX90A-NOTTGSPLIT, GFX940-NOTTGSPLIT,
;     GFX11-CU and GFX12-CU expect s_endpgm right after the atomic.
;   * GFX10-WGP and GFX11-WGP expect "s_waitcnt_vscnt null, 0x0" followed by
;     buffer_gl0_inv after the atomic.
;   * GFX90A-TGSPLIT expects "s_waitcnt vmcnt(0)" then buffer_wbinvl1_vol.
;   * GFX940-TGSPLIT expects "s_waitcnt vmcnt(0)" then "buffer_inv sc0".
;   * GFX12-WGP expects scope:SCOPE_SE on the atomic itself, followed by
;     "s_wait_storecnt 0x0" and "global_inv scope:SCOPE_SE".
; NOTE(review): presumably the trailing wait + invalidate is what the acquire
; failure ordering requires in the modes where a work-group may not share a
; cache -- confirm against the AMDGPU memory model documentation.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
13568define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
13569; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13570; GFX7:       ; %bb.0: ; %entry
13571; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13572; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13573; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13574; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13575; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13576; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13577; GFX7-NEXT:    s_mov_b32 s4, s8
13578; GFX7-NEXT:    s_mov_b32 s5, s9
13579; GFX7-NEXT:    s_mov_b32 s9, s10
13580; GFX7-NEXT:    s_mov_b32 s8, s11
13581; GFX7-NEXT:    s_add_u32 s4, s4, s9
13582; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13583; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13584; GFX7-NEXT:    s_mov_b32 s5, s8
13585; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13586; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13587; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13588; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13589; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13590; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13591; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13592; GFX7-NEXT:    s_endpgm
13593;
13594; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13595; GFX10-WGP:       ; %bb.0: ; %entry
13596; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13597; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13598; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13599; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13600; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13601; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13602; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13603; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13604; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13605; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13606; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13607; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13608; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13609; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13610; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13611; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13612; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13613; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13614; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13615; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13616; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13617; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13618; GFX10-WGP-NEXT:    buffer_gl0_inv
13619; GFX10-WGP-NEXT:    s_endpgm
13620;
13621; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13622; GFX10-CU:       ; %bb.0: ; %entry
13623; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13624; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13625; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13626; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13627; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13628; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13629; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13630; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13631; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13632; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13633; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13634; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13635; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13636; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13637; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13638; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13639; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13640; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13641; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13642; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13643; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13644; GFX10-CU-NEXT:    s_endpgm
13645;
13646; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13647; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13648; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13649; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13650; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13651; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13652; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13653; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13654; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13655; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13656; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13657; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13658; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13659; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13660; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13661; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13662; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13663; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13664; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13665; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13666; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13667; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13668; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13669; SKIP-CACHE-INV-NEXT:    s_endpgm
13670;
13671; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13672; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13673; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13674; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13675; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13676; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13677; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13678; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13679; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13680; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13681; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13682; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13683; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13684;
13685; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13686; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13687; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13688; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13689; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13690; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13691; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13692; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13693; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13694; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13695; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13696; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13697; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13698; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13699; GFX90A-TGSPLIT-NEXT:    s_endpgm
13700;
13701; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13702; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13703; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13704; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13705; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13706; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13707; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13708; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13709; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13710; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13711; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13712; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13713; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13714;
13715; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13716; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13717; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13718; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13719; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13720; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13721; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13722; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13723; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13724; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13725; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13726; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13727; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13728; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
13729; GFX940-TGSPLIT-NEXT:    s_endpgm
13730;
13731; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13732; GFX11-WGP:       ; %bb.0: ; %entry
13733; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13734; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13735; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13736; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13737; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13738; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13739; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13740; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13741; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13742; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13743; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13744; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13745; GFX11-WGP-NEXT:    buffer_gl0_inv
13746; GFX11-WGP-NEXT:    s_endpgm
13747;
13748; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13749; GFX11-CU:       ; %bb.0: ; %entry
13750; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13751; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13752; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13753; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13754; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13755; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13756; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13757; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13758; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13759; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13760; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13761; GFX11-CU-NEXT:    s_endpgm
13762;
13763; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13764; GFX12-WGP:       ; %bb.0: ; %entry
13765; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13766; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13767; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13768; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13769; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13770; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13771; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13772; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13773; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13774; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13775; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13776; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13777; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
13778; GFX12-WGP-NEXT:    s_endpgm
13779;
13780; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13781; GFX12-CU:       ; %bb.0: ; %entry
13782; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13783; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13784; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13785; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13786; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13787; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13788; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13789; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13790; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13791; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13792; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13793; GFX12-CU-NEXT:    s_endpgm
13794    ptr %out, i32 %in, i32 %old) {
13795entry:
  ; Address of i32 element 4 of %out, i.e. 4 * 4 = 16 bytes past the base.
13796  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on match; %val is never read, so only the
  ; expected lowering of the atomic itself is being checked.
13797  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
13798  ret void
13799}
13800
; Test: flat-address-space cmpxchg at syncscope("workgroup-one-as") with
; success ordering acquire and failure ordering acquire.
;
; The kernel takes a pointer and two i32 values, forms the address of i32
; element 4 of %out (byte offset 16, which shows up in the expected output as
; either "offset:16" on the atomic or an explicit 64-bit add of 16) and issues
; a volatile cmpxchg whose result is never used.
;
; What the assertions below pin down around the flat_atomic_cmpswap:
;   * For every prefix the atomic is directly preceded by the move that sets
;     up its address operand; no wait is expected between operand setup and
;     the atomic.
;   * GFX7, GFX10-CU, SKIP-CACHE-INV, GFX90A-NOTTGSPLIT, GFX940-NOTTGSPLIT,
;     GFX11-CU and GFX12-CU expect s_endpgm right after the atomic.
;   * GFX10-WGP and GFX11-WGP expect "s_waitcnt_vscnt null, 0x0" followed by
;     buffer_gl0_inv after the atomic.
;   * GFX90A-TGSPLIT expects "s_waitcnt vmcnt(0)" then buffer_wbinvl1_vol.
;   * GFX940-TGSPLIT expects "s_waitcnt vmcnt(0)" then "buffer_inv sc0".
;   * GFX12-WGP expects scope:SCOPE_SE on the atomic itself, followed by
;     "s_wait_storecnt 0x0" and "global_inv scope:SCOPE_SE".
; NOTE(review): presumably the trailing wait + invalidate implements the
; acquire half of the operation in the modes where a work-group may not share
; a cache -- confirm against the AMDGPU memory model documentation.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
13801define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
13802; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13803; GFX7:       ; %bb.0: ; %entry
13804; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13805; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13806; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13807; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13808; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13809; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13810; GFX7-NEXT:    s_mov_b32 s4, s8
13811; GFX7-NEXT:    s_mov_b32 s5, s9
13812; GFX7-NEXT:    s_mov_b32 s9, s10
13813; GFX7-NEXT:    s_mov_b32 s8, s11
13814; GFX7-NEXT:    s_add_u32 s4, s4, s9
13815; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13816; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13817; GFX7-NEXT:    s_mov_b32 s5, s8
13818; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13819; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13820; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13821; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13822; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13823; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13824; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13825; GFX7-NEXT:    s_endpgm
13826;
13827; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13828; GFX10-WGP:       ; %bb.0: ; %entry
13829; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13830; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13831; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13832; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13833; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13834; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13835; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13836; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13837; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13838; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13839; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13840; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13841; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13842; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13843; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13844; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13845; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13846; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13847; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13848; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13849; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13850; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13851; GFX10-WGP-NEXT:    buffer_gl0_inv
13852; GFX10-WGP-NEXT:    s_endpgm
13853;
13854; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13855; GFX10-CU:       ; %bb.0: ; %entry
13856; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13857; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13858; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13859; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13860; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13861; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13862; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13863; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13864; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13865; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13866; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13867; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13868; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13869; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13870; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13871; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13872; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13873; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13874; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13875; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13876; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13877; GFX10-CU-NEXT:    s_endpgm
13878;
13879; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13880; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13881; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13882; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13883; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13884; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13885; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13886; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13887; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13888; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13889; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13890; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13891; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13892; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13893; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13894; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13895; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13896; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13897; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13898; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13899; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13900; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13901; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13902; SKIP-CACHE-INV-NEXT:    s_endpgm
13903;
13904; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13905; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13906; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13907; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13908; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13909; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13910; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13911; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13912; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13913; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13914; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13915; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13916; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13917;
13918; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13919; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13920; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13921; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13922; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13923; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13924; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13925; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13926; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13927; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13928; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13929; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13930; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13931; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13932; GFX90A-TGSPLIT-NEXT:    s_endpgm
13933;
13934; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13935; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13936; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13937; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13938; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13939; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13940; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13941; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13942; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13943; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13944; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13945; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13946; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13947;
13948; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13949; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13950; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13951; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13952; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13953; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13954; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13955; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13956; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13957; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13958; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13959; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13960; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13961; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
13962; GFX940-TGSPLIT-NEXT:    s_endpgm
13963;
13964; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13965; GFX11-WGP:       ; %bb.0: ; %entry
13966; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13967; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13968; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13969; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13970; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13971; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13972; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13973; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13974; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13975; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13976; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13977; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13978; GFX11-WGP-NEXT:    buffer_gl0_inv
13979; GFX11-WGP-NEXT:    s_endpgm
13980;
13981; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13982; GFX11-CU:       ; %bb.0: ; %entry
13983; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13984; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13985; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13986; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13987; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13988; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13989; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13990; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13991; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13992; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13993; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13994; GFX11-CU-NEXT:    s_endpgm
13995;
13996; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13997; GFX12-WGP:       ; %bb.0: ; %entry
13998; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13999; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14000; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14001; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14002; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14003; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14004; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14005; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14006; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14007; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14008; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14009; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14010; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
14011; GFX12-WGP-NEXT:    s_endpgm
14012;
14013; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
14014; GFX12-CU:       ; %bb.0: ; %entry
14015; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14016; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14017; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14018; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14019; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14020; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14021; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14022; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14023; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14024; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14025; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14026; GFX12-CU-NEXT:    s_endpgm
14027    ptr %out, i32 %in, i32 %old) {
14028entry:
  ; Address of i32 element 4 of %out, i.e. 4 * 4 = 16 bytes past the base.
14029  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on match; %val is never read, so only the
  ; expected lowering of the atomic itself is being checked.
14030  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
14031  ret void
14032}
14033
14034define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
14035; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14036; GFX7:       ; %bb.0: ; %entry
14037; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14038; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14039; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14040; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14041; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14042; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14043; GFX7-NEXT:    s_mov_b32 s4, s8
14044; GFX7-NEXT:    s_mov_b32 s5, s9
14045; GFX7-NEXT:    s_mov_b32 s9, s10
14046; GFX7-NEXT:    s_mov_b32 s8, s11
14047; GFX7-NEXT:    s_add_u32 s4, s4, s9
14048; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14049; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14050; GFX7-NEXT:    s_mov_b32 s5, s8
14051; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14052; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14053; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14054; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14055; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14056; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14057; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14058; GFX7-NEXT:    s_endpgm
14059;
14060; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14061; GFX10-WGP:       ; %bb.0: ; %entry
14062; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14063; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14064; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14065; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14066; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14067; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14068; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14069; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14070; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14071; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14072; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14073; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14074; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14075; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14076; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14077; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14078; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14079; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14080; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14081; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14082; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14083; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14084; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14085; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14086; GFX10-WGP-NEXT:    buffer_gl0_inv
14087; GFX10-WGP-NEXT:    s_endpgm
14088;
14089; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14090; GFX10-CU:       ; %bb.0: ; %entry
14091; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14092; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14093; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14094; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14095; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14096; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14097; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14098; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14099; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14100; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14101; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14102; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14103; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14104; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14105; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14106; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14107; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14108; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14109; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14110; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14111; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14112; GFX10-CU-NEXT:    s_endpgm
14113;
14114; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14115; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14116; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14117; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14118; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14119; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14120; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14121; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14122; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14123; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14124; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14125; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14126; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14127; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14128; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14129; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14130; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14131; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14132; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14133; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14134; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14135; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14136; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14137; SKIP-CACHE-INV-NEXT:    s_endpgm
14138;
14139; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14140; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14141; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14142; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14143; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14144; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14145; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14146; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14147; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14148; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14149; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14150; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14151; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14152;
14153; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14154; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14155; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14156; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14157; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14158; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14159; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14160; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14161; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14162; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14163; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14164; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14165; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14166; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14167; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14168; GFX90A-TGSPLIT-NEXT:    s_endpgm
14169;
14170; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14171; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14172; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14173; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14174; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14175; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14176; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14177; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14178; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14179; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14180; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14181; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14182; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14183;
14184; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14185; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14186; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14187; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14188; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14189; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14190; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14191; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14192; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14193; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14194; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14195; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14196; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14197; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14198; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
14199; GFX940-TGSPLIT-NEXT:    s_endpgm
14200;
14201; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14202; GFX11-WGP:       ; %bb.0: ; %entry
14203; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14204; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14205; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14206; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14207; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14208; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14209; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14210; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14211; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14212; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14213; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14214; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14215; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14216; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14217; GFX11-WGP-NEXT:    buffer_gl0_inv
14218; GFX11-WGP-NEXT:    s_endpgm
14219;
14220; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14221; GFX11-CU:       ; %bb.0: ; %entry
14222; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14223; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14224; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14225; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14226; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14227; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14228; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14229; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14230; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14231; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14232; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14233; GFX11-CU-NEXT:    s_endpgm
14234;
14235; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14236; GFX12-WGP:       ; %bb.0: ; %entry
14237; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14238; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14239; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14240; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14241; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14242; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14243; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14244; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14245; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14246; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14247; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14248; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14249; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14250; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14251; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14252; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14253; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
14254; GFX12-WGP-NEXT:    s_endpgm
14255;
14256; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14257; GFX12-CU:       ; %bb.0: ; %entry
14258; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14259; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14260; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14261; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14262; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14263; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14264; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14265; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14266; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14267; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14268; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14269; GFX12-CU-NEXT:    s_endpgm
14270    ptr %out, i32 %in, i32 %old) {
14271entry:
14272  %gep = getelementptr i32, ptr %out, i32 4
14273  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
14274  ret void
14275}
14276
; Kernel under test - a volatile i32 cmpxchg through a generic (flat) pointer at
; syncscope "workgroup-one-as", with acq_rel ordering on success and acquire
; ordering on failure. The exchange result is unused.
;
; The check lines below are autogenerated (see the note on the first line of
; this file). Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expectations show - the gfx10, gfx11 and gfx12 runs without +cumode
; and the two +tgsplit runs expect wait instructions in front of the atomic and
; a wait plus a cache invalidate behind it. Every other run expects the bare
; flat_atomic_cmpswap followed directly by s_endpgm.
14277define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
14278; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14279; GFX7:       ; %bb.0: ; %entry
14280; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14281; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14282; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14283; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14284; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14285; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14286; GFX7-NEXT:    s_mov_b32 s4, s8
14287; GFX7-NEXT:    s_mov_b32 s5, s9
14288; GFX7-NEXT:    s_mov_b32 s9, s10
14289; GFX7-NEXT:    s_mov_b32 s8, s11
14290; GFX7-NEXT:    s_add_u32 s4, s4, s9
14291; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14292; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14293; GFX7-NEXT:    s_mov_b32 s5, s8
14294; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14295; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14296; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14297; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14298; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14299; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14300; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14301; GFX7-NEXT:    s_endpgm
14302;
14303; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14304; GFX10-WGP:       ; %bb.0: ; %entry
14305; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14306; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14307; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14308; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14309; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14310; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14311; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14312; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14313; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14314; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14315; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14316; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14317; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14318; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14319; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14320; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14321; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14322; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14323; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14324; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14325; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14326; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14327; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14328; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14329; GFX10-WGP-NEXT:    buffer_gl0_inv
14330; GFX10-WGP-NEXT:    s_endpgm
14331;
14332; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14333; GFX10-CU:       ; %bb.0: ; %entry
14334; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14335; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14336; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14337; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14338; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14339; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14340; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14341; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14342; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14343; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14344; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14345; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14346; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14347; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14348; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14349; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14350; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14351; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14352; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14353; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14354; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14355; GFX10-CU-NEXT:    s_endpgm
14356;
14357; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14358; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14359; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14360; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14361; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14362; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14363; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14364; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14365; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14366; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14368; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14369; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14370; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14371; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14372; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14373; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14374; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14375; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14377; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14378; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14379; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14380; SKIP-CACHE-INV-NEXT:    s_endpgm
14381;
14382; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14383; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14384; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14385; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14386; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14387; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14388; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14389; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14390; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14391; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14392; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14393; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14394; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14395;
14396; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14397; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14398; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14399; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14400; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14401; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14402; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14403; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14404; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14405; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14406; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14407; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14408; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14409; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14410; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14411; GFX90A-TGSPLIT-NEXT:    s_endpgm
14412;
14413; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14414; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14415; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14416; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14417; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14418; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14419; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14420; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14421; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14422; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14423; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14424; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14425; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14426;
14427; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14428; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14429; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14430; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14431; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14432; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14433; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14434; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14435; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14436; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14437; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14438; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14439; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14440; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14441; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
14442; GFX940-TGSPLIT-NEXT:    s_endpgm
14443;
14444; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14445; GFX11-WGP:       ; %bb.0: ; %entry
14446; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14447; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14448; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14449; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14450; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14451; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14452; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14453; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14454; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14455; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14456; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14457; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14458; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14459; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14460; GFX11-WGP-NEXT:    buffer_gl0_inv
14461; GFX11-WGP-NEXT:    s_endpgm
14462;
14463; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14464; GFX11-CU:       ; %bb.0: ; %entry
14465; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14466; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14467; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14468; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14469; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14470; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14471; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14472; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14473; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14474; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14475; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14476; GFX11-CU-NEXT:    s_endpgm
14477;
14478; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14479; GFX12-WGP:       ; %bb.0: ; %entry
14480; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14481; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14482; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14483; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14484; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14485; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14486; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14487; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14488; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14489; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14490; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14491; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14492; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14493; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14494; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14495; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14496; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
14497; GFX12-WGP-NEXT:    s_endpgm
14498;
14499; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14500; GFX12-CU:       ; %bb.0: ; %entry
14501; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14502; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14503; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14504; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14505; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14506; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14507; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14508; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14509; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14510; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14511; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14512; GFX12-CU-NEXT:    s_endpgm
14513    ptr %out, i32 %in, i32 %old) {
14514entry:
; Element index 4 of an i32 array is a 16 byte displacement. It shows up in the
; expectations either as offset 16 on the atomic or as an explicit 64-bit add
; of the constant 16 to the pointer.
14515  %gep = getelementptr i32, ptr %out, i32 4
; %old is the expected value and %in is the replacement value.
14516  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
14517  ret void
14518}
14519
; Kernel under test - a volatile i32 cmpxchg through a generic (flat) pointer at
; syncscope "workgroup-one-as", with seq_cst ordering on success and acquire
; ordering on failure. The exchange result is unused.
;
; The check lines below are autogenerated (see the note on the first line of
; this file). Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expectations show - the gfx10, gfx11 and gfx12 runs without +cumode
; and the two +tgsplit runs expect wait instructions in front of the atomic and
; a wait plus a cache invalidate behind it. Every other run expects the bare
; flat_atomic_cmpswap followed directly by s_endpgm.
14520define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
14521; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14522; GFX7:       ; %bb.0: ; %entry
14523; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14524; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14525; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14526; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14527; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14528; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14529; GFX7-NEXT:    s_mov_b32 s4, s8
14530; GFX7-NEXT:    s_mov_b32 s5, s9
14531; GFX7-NEXT:    s_mov_b32 s9, s10
14532; GFX7-NEXT:    s_mov_b32 s8, s11
14533; GFX7-NEXT:    s_add_u32 s4, s4, s9
14534; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14535; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14536; GFX7-NEXT:    s_mov_b32 s5, s8
14537; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14538; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14539; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14540; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14541; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14542; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14543; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14544; GFX7-NEXT:    s_endpgm
14545;
14546; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14547; GFX10-WGP:       ; %bb.0: ; %entry
14548; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14549; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14550; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14551; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14552; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14553; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14554; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14555; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14556; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14557; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14558; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14559; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14560; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14561; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14562; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14563; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14564; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14565; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14566; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14567; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14568; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14569; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14570; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14571; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14572; GFX10-WGP-NEXT:    buffer_gl0_inv
14573; GFX10-WGP-NEXT:    s_endpgm
14574;
14575; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14576; GFX10-CU:       ; %bb.0: ; %entry
14577; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14578; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14579; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14580; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14581; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14582; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14583; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14584; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14585; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14586; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14587; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14588; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14589; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14590; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14591; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14592; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14593; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14594; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14595; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14596; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14597; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14598; GFX10-CU-NEXT:    s_endpgm
14599;
14600; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14601; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14602; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14603; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14604; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14605; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14606; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14607; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14608; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14609; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14610; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14611; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14612; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14613; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14614; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14615; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14616; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14618; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14619; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14620; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14621; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14622; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14623; SKIP-CACHE-INV-NEXT:    s_endpgm
14624;
14625; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14626; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14627; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14628; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14629; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14630; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14631; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14632; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14633; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14634; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14635; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14636; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14637; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14638;
14639; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14640; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14641; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14642; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14643; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14644; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14645; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14646; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14647; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14648; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14649; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14650; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14651; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14652; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14653; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14654; GFX90A-TGSPLIT-NEXT:    s_endpgm
14655;
14656; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14657; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14658; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14659; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14660; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14661; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14662; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14663; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14664; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14665; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14666; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14667; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14668; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14669;
14670; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14671; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14672; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14673; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14674; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14675; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14676; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14677; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14678; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14679; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14680; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14681; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14682; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14683; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14684; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
14685; GFX940-TGSPLIT-NEXT:    s_endpgm
14686;
14687; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14688; GFX11-WGP:       ; %bb.0: ; %entry
14689; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14690; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14691; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14692; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14693; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14694; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14695; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14696; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14697; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14698; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14699; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14700; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14701; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14702; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14703; GFX11-WGP-NEXT:    buffer_gl0_inv
14704; GFX11-WGP-NEXT:    s_endpgm
14705;
14706; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14707; GFX11-CU:       ; %bb.0: ; %entry
14708; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14709; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14710; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14711; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14712; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14713; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14714; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14715; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14716; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14717; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14718; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14719; GFX11-CU-NEXT:    s_endpgm
14720;
14721; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14722; GFX12-WGP:       ; %bb.0: ; %entry
14723; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14724; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14725; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14726; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14727; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14728; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14729; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14730; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14731; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14732; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14733; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14734; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14735; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14736; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14737; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14738; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14739; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
14740; GFX12-WGP-NEXT:    s_endpgm
14741;
14742; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14743; GFX12-CU:       ; %bb.0: ; %entry
14744; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14745; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14746; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14747; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14748; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14749; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14750; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14751; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14752; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14753; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14754; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14755; GFX12-CU-NEXT:    s_endpgm
14756    ptr %out, i32 %in, i32 %old) {
14757entry:
; Element index 4 of an i32 array is a 16 byte displacement. It shows up in the
; expectations either as offset 16 on the atomic or as an explicit 64-bit add
; of the constant 16 to the pointer.
14758  %gep = getelementptr i32, ptr %out, i32 4
; %old is the expected value and %in is the replacement value.
14759  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
14760  ret void
14761}
14762
; Codegen test for a volatile cmpxchg through a plain `ptr` argument with
; syncscope("workgroup-one-as"), success ordering monotonic and failure
; ordering seq_cst. %old is the compare value and %in is the replacement.
; The checks show the operation being emitted as a flat_atomic_cmpswap.
;
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of the file). Regenerate them with that
; script instead of editing them by hand.
;
; What the checks currently pin down
;  * gfx1010, gfx1100 and gfx1200 without +cumode, and gfx90a / gfx940 with
;    +tgsplit, expect wait instructions before and after the atomic plus a
;    cache invalidate after it. The gfx1200 variant additionally carries
;    scope SCOPE_SE on the atomic and on the invalidate.
;  * every other RUN configuration expects the bare atomic followed directly
;    by s_endpgm.
14763define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
14764; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14765; GFX7:       ; %bb.0: ; %entry
14766; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14767; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14768; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14769; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14770; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14771; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14772; GFX7-NEXT:    s_mov_b32 s4, s8
14773; GFX7-NEXT:    s_mov_b32 s5, s9
14774; GFX7-NEXT:    s_mov_b32 s9, s10
14775; GFX7-NEXT:    s_mov_b32 s8, s11
14776; GFX7-NEXT:    s_add_u32 s4, s4, s9
14777; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14778; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14779; GFX7-NEXT:    s_mov_b32 s5, s8
14780; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14781; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14782; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14783; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14784; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14785; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14786; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14787; GFX7-NEXT:    s_endpgm
14788;
14789; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14790; GFX10-WGP:       ; %bb.0: ; %entry
14791; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14792; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14793; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14794; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14795; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14796; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14797; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14798; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14799; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14800; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14801; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14802; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14803; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14804; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14805; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14806; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14807; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14808; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14809; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14810; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14811; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14812; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14813; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14814; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14815; GFX10-WGP-NEXT:    buffer_gl0_inv
14816; GFX10-WGP-NEXT:    s_endpgm
14817;
14818; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14819; GFX10-CU:       ; %bb.0: ; %entry
14820; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14821; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14822; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14823; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14824; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14825; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14826; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14827; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14828; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14829; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14830; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14831; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14832; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14833; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14834; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14835; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14836; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14837; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14838; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14839; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14840; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14841; GFX10-CU-NEXT:    s_endpgm
14842;
14843; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14844; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14845; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14846; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14847; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14848; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14849; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14850; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14851; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14852; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14853; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14854; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14855; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14856; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14857; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14858; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14859; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14861; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14862; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14863; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14864; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14865; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14866; SKIP-CACHE-INV-NEXT:    s_endpgm
14867;
14868; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14869; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14870; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14871; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14872; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14873; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14874; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14875; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14876; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14877; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14878; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14879; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14880; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14881;
14882; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14883; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14884; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14885; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14886; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14887; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14888; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14889; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14890; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14891; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14892; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14893; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14894; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14895; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14896; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14897; GFX90A-TGSPLIT-NEXT:    s_endpgm
14898;
14899; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14900; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14901; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14902; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14903; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14904; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14905; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14906; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14907; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14908; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14909; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14910; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14911; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14912;
14913; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14914; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14915; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14916; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14917; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14918; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14919; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14920; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14921; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14922; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14923; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14924; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14925; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14926; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14927; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
14928; GFX940-TGSPLIT-NEXT:    s_endpgm
14929;
14930; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14931; GFX11-WGP:       ; %bb.0: ; %entry
14932; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14933; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14934; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14935; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14936; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14937; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14938; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14939; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14940; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14941; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14942; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14943; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14944; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14945; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14946; GFX11-WGP-NEXT:    buffer_gl0_inv
14947; GFX11-WGP-NEXT:    s_endpgm
14948;
14949; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14950; GFX11-CU:       ; %bb.0: ; %entry
14951; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14952; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14953; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14954; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14955; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14956; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14957; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14958; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14959; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14960; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14961; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14962; GFX11-CU-NEXT:    s_endpgm
14963;
14964; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14965; GFX12-WGP:       ; %bb.0: ; %entry
14966; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14967; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14968; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14969; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14970; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14971; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14972; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14973; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14974; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14975; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14976; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14977; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14978; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14979; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14980; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14981; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14982; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
14983; GFX12-WGP-NEXT:    s_endpgm
14984;
14985; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14986; GFX12-CU:       ; %bb.0: ; %entry
14987; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14988; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14989; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14990; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14991; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14992; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14993; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14994; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14995; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14996; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14997; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14998; GFX12-CU-NEXT:    s_endpgm
14999    ptr %out, i32 %in, i32 %old) {
15000entry:
  ; %gep is element 4 of %out viewed as i32, which the checks above show as a
  ; 16-byte offset. The cmpxchg result %val is not read afterwards.
15001  %gep = getelementptr i32, ptr %out, i32 4
15002  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
15003  ret void
15004}
15005
; Codegen test for a volatile cmpxchg through a plain `ptr` argument with
; syncscope("workgroup-one-as"), success ordering acquire and failure
; ordering seq_cst. %old is the compare value and %in is the replacement.
; The checks show the operation being emitted as a flat_atomic_cmpswap.
;
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of the file). Regenerate them with that
; script instead of editing them by hand.
;
; What the checks currently pin down
;  * gfx1010, gfx1100 and gfx1200 without +cumode, and gfx90a / gfx940 with
;    +tgsplit, expect wait instructions before and after the atomic plus a
;    cache invalidate after it. The gfx1200 variant additionally carries
;    scope SCOPE_SE on the atomic and on the invalidate.
;  * every other RUN configuration expects the bare atomic followed directly
;    by s_endpgm.
15006define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
15007; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15008; GFX7:       ; %bb.0: ; %entry
15009; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15010; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15011; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15012; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15013; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15014; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15015; GFX7-NEXT:    s_mov_b32 s4, s8
15016; GFX7-NEXT:    s_mov_b32 s5, s9
15017; GFX7-NEXT:    s_mov_b32 s9, s10
15018; GFX7-NEXT:    s_mov_b32 s8, s11
15019; GFX7-NEXT:    s_add_u32 s4, s4, s9
15020; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15021; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15022; GFX7-NEXT:    s_mov_b32 s5, s8
15023; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15024; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15025; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15026; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15027; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15028; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15029; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15030; GFX7-NEXT:    s_endpgm
15031;
15032; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15033; GFX10-WGP:       ; %bb.0: ; %entry
15034; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15035; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15036; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15037; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15038; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15039; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15040; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15041; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15042; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15043; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15044; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15045; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15046; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15047; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15048; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15049; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15050; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15051; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15052; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15053; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15054; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15055; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15056; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15057; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15058; GFX10-WGP-NEXT:    buffer_gl0_inv
15059; GFX10-WGP-NEXT:    s_endpgm
15060;
15061; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15062; GFX10-CU:       ; %bb.0: ; %entry
15063; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15064; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15065; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15066; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15067; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15068; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15069; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15070; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15071; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15072; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15073; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15074; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15075; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15076; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15077; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15079; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15080; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15081; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15082; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15083; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15084; GFX10-CU-NEXT:    s_endpgm
15085;
15086; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15087; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15088; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15089; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15090; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15091; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15092; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15093; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15094; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15095; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15096; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15097; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15098; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15099; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15100; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15101; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15102; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15103; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15104; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15105; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15106; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15107; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15108; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15109; SKIP-CACHE-INV-NEXT:    s_endpgm
15110;
15111; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15112; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15113; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15114; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15115; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15116; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15117; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15118; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15119; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15120; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15121; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15122; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15123; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15124;
15125; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15126; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15127; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15128; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15129; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15130; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15131; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15132; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15133; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15134; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15135; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15136; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15137; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15138; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15139; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15140; GFX90A-TGSPLIT-NEXT:    s_endpgm
15141;
15142; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15143; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15144; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15145; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15146; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15147; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15148; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15149; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15150; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15151; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15152; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15153; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15154; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15155;
15156; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15157; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15158; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15159; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15160; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15161; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15162; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15163; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15164; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15165; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15166; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15167; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15168; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15169; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15170; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
15171; GFX940-TGSPLIT-NEXT:    s_endpgm
15172;
15173; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15174; GFX11-WGP:       ; %bb.0: ; %entry
15175; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15176; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15177; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15178; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15179; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15180; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15181; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15182; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15183; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15184; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15185; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15186; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15187; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15188; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15189; GFX11-WGP-NEXT:    buffer_gl0_inv
15190; GFX11-WGP-NEXT:    s_endpgm
15191;
15192; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15193; GFX11-CU:       ; %bb.0: ; %entry
15194; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15195; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15196; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15197; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15198; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15199; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15200; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15201; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15202; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15203; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15204; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15205; GFX11-CU-NEXT:    s_endpgm
15206;
15207; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15208; GFX12-WGP:       ; %bb.0: ; %entry
15209; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15210; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15211; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15212; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15213; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15214; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15215; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15216; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15217; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15218; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15219; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15220; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15221; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15222; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15223; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15224; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15225; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
15226; GFX12-WGP-NEXT:    s_endpgm
15227;
15228; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15229; GFX12-CU:       ; %bb.0: ; %entry
15230; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15231; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15232; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15233; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15234; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15235; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15236; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15237; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15238; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15239; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15240; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15241; GFX12-CU-NEXT:    s_endpgm
15242    ptr %out, i32 %in, i32 %old) {
15243entry:
  ; %gep is element 4 of %out viewed as i32, which the checks above show as a
  ; 16-byte offset. The cmpxchg result %val is not read afterwards.
15244  %gep = getelementptr i32, ptr %out, i32 4
15245  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
15246  ret void
15247}
15248
15249define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
15250; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15251; GFX7:       ; %bb.0: ; %entry
15252; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15253; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15254; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15255; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15256; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15257; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15258; GFX7-NEXT:    s_mov_b32 s4, s8
15259; GFX7-NEXT:    s_mov_b32 s5, s9
15260; GFX7-NEXT:    s_mov_b32 s9, s10
15261; GFX7-NEXT:    s_mov_b32 s8, s11
15262; GFX7-NEXT:    s_add_u32 s4, s4, s9
15263; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15264; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15265; GFX7-NEXT:    s_mov_b32 s5, s8
15266; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15267; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15268; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15269; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15270; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15271; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15272; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15273; GFX7-NEXT:    s_endpgm
15274;
15275; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15276; GFX10-WGP:       ; %bb.0: ; %entry
15277; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15278; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15279; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15280; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15281; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15282; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15283; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15284; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15285; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15286; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15287; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15288; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15289; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15290; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15291; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15292; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15293; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15294; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15295; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15296; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15297; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15298; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15299; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15300; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15301; GFX10-WGP-NEXT:    buffer_gl0_inv
15302; GFX10-WGP-NEXT:    s_endpgm
15303;
15304; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15305; GFX10-CU:       ; %bb.0: ; %entry
15306; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15307; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15308; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15309; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15310; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15311; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15312; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15313; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15314; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15315; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15316; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15317; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15318; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15319; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15320; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15321; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15322; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15323; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15324; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15325; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15326; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15327; GFX10-CU-NEXT:    s_endpgm
15328;
15329; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15330; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15331; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15332; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15333; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15334; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15335; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15336; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15337; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15338; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15339; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15340; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15341; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15342; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15343; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15344; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15345; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15346; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15347; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15348; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15349; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15351; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15352; SKIP-CACHE-INV-NEXT:    s_endpgm
15353;
15354; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15355; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15356; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15357; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15358; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15359; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15360; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15361; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15362; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15363; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15364; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15365; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15366; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15367;
15368; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15369; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15370; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15371; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15372; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15373; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15374; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15375; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15376; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15377; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15378; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15379; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15380; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15381; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15382; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15383; GFX90A-TGSPLIT-NEXT:    s_endpgm
15384;
15385; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15386; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15387; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15388; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15389; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15390; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15391; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15392; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15393; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15394; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15395; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15396; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15397; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15398;
15399; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15400; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15401; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15402; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15403; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15404; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15405; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15406; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15407; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15408; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15409; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15410; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15411; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15412; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15413; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
15414; GFX940-TGSPLIT-NEXT:    s_endpgm
15415;
15416; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15417; GFX11-WGP:       ; %bb.0: ; %entry
15418; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15419; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15420; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15421; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15422; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15423; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15424; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15425; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15426; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15427; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15428; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15429; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15430; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15431; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15432; GFX11-WGP-NEXT:    buffer_gl0_inv
15433; GFX11-WGP-NEXT:    s_endpgm
15434;
15435; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15436; GFX11-CU:       ; %bb.0: ; %entry
15437; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15438; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15439; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15440; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15441; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15442; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15443; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15444; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15445; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15446; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15447; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15448; GFX11-CU-NEXT:    s_endpgm
15449;
15450; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15451; GFX12-WGP:       ; %bb.0: ; %entry
15452; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15453; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15454; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15455; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15456; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15457; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15458; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15459; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15460; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15461; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15462; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15463; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15464; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15465; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15466; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15467; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15468; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
15469; GFX12-WGP-NEXT:    s_endpgm
15470;
15471; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15472; GFX12-CU:       ; %bb.0: ; %entry
15473; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15474; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15475; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15476; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15477; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15478; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15479; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15480; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15481; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15482; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15483; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15484; GFX12-CU-NEXT:    s_endpgm
15485    ptr %out, i32 %in, i32 %old) {
15486entry:
15487  %gep = getelementptr i32, ptr %out, i32 4
15488  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
15489  ret void
15490}
15491
; Verifies lowering of a no-return `cmpxchg` through a flat pointer at
; syncscope("workgroup-one-as") with acq_rel success ordering and seq_cst
; failure ordering. The loaded value (%val) is unused, and every expected
; sequence below uses the atomic form without a destination register.
; Expected shape per run configuration, as recorded in the assertions below:
;   - WGP-mode runs (gfx10/gfx11/gfx12) and tgsplit runs (gfx90a/gfx940):
;     wait(s) before the atomic, then a wait and a cache invalidate after it.
;   - gfx7, CU-mode, non-tgsplit and skip-cache-invalidation runs: the bare
;     atomic with no extra waits or invalidates.
; NOTE(review): presumably the extra synchronization appears only where the
; waves of a workgroup may not share one first-level cache -- confirm against
; the AMDGPU memory model documentation.
; The assertions are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
15492define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
15493; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15494; GFX7:       ; %bb.0: ; %entry
15495; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15496; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15497; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15498; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15499; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15500; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15501; GFX7-NEXT:    s_mov_b32 s4, s8
15502; GFX7-NEXT:    s_mov_b32 s5, s9
15503; GFX7-NEXT:    s_mov_b32 s9, s10
15504; GFX7-NEXT:    s_mov_b32 s8, s11
15505; GFX7-NEXT:    s_add_u32 s4, s4, s9
15506; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15507; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15508; GFX7-NEXT:    s_mov_b32 s5, s8
15509; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15510; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15511; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15512; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15513; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15514; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15515; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15516; GFX7-NEXT:    s_endpgm
15517;
15518; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15519; GFX10-WGP:       ; %bb.0: ; %entry
15520; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15521; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15522; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15523; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15524; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15525; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15526; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15527; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15528; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15529; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15530; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15531; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15532; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15533; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15534; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15535; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15536; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15537; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15538; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15539; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15540; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15541; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15542; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15543; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15544; GFX10-WGP-NEXT:    buffer_gl0_inv
15545; GFX10-WGP-NEXT:    s_endpgm
15546;
15547; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15548; GFX10-CU:       ; %bb.0: ; %entry
15549; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15550; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15551; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15552; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15553; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15554; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15555; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15556; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15557; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15558; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15559; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15560; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15561; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15562; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15563; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15564; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15565; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15566; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15567; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15568; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15569; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15570; GFX10-CU-NEXT:    s_endpgm
15571;
15572; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15573; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15574; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15575; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15576; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15577; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15578; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15579; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15580; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15581; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15582; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15583; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15584; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15585; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15586; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15587; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15588; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15590; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15592; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15593; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15594; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15595; SKIP-CACHE-INV-NEXT:    s_endpgm
15596;
15597; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15598; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15599; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15600; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15601; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15602; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15603; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15604; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15605; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15606; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15607; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15608; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15609; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15610;
15611; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15612; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15613; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15614; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15615; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15616; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15617; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15618; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15619; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15620; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15621; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15622; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15623; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15624; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15625; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15626; GFX90A-TGSPLIT-NEXT:    s_endpgm
15627;
15628; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15629; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15630; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15631; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15632; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15633; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15634; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15635; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15636; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15637; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15638; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15639; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15640; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15641;
15642; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15643; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15644; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15645; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15646; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15647; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15648; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15649; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15650; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15651; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15652; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15653; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15654; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15655; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15656; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
15657; GFX940-TGSPLIT-NEXT:    s_endpgm
15658;
15659; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15660; GFX11-WGP:       ; %bb.0: ; %entry
15661; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15662; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15663; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15664; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15665; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15666; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15667; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15668; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15669; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15670; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15671; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15672; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15673; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15674; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15675; GFX11-WGP-NEXT:    buffer_gl0_inv
15676; GFX11-WGP-NEXT:    s_endpgm
15677;
15678; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15679; GFX11-CU:       ; %bb.0: ; %entry
15680; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15681; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15682; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15683; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15684; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15685; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15686; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15687; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15688; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15689; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15690; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15691; GFX11-CU-NEXT:    s_endpgm
15692;
15693; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15694; GFX12-WGP:       ; %bb.0: ; %entry
15695; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15696; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15697; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15698; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15699; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15700; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15701; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15702; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15703; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15704; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15705; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15706; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15707; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15708; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15709; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15710; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15711; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
15712; GFX12-WGP-NEXT:    s_endpgm
15713;
15714; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15715; GFX12-CU:       ; %bb.0: ; %entry
15716; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15717; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15718; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15719; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15720; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15721; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15722; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15723; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15724; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15725; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15726; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15727; GFX12-CU-NEXT:    s_endpgm
15728    ptr %out, i32 %in, i32 %old) {
15729entry:
  ; Element index 4 of i32 is a 16-byte displacement; the expected output shows
  ; it either as `offset:16` on the atomic or as a scalar add of 16 to the base.
15730  %gep = getelementptr i32, ptr %out, i32 4
  ; Compares the memory word against %old and stores %in on a match. The
  ; operation is volatile and its result (%val) is never read.
15731  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
15732  ret void
15733}
15734
; Verifies lowering of a no-return `cmpxchg` through a flat pointer at
; syncscope("workgroup-one-as") with seq_cst ordering for both the success and
; the failure case. The loaded value (%val) is unused, and every expected
; sequence below uses the atomic form without a destination register.
; Expected shape per run configuration, as recorded in the assertions below:
;   - WGP-mode runs (gfx10/gfx11/gfx12) and tgsplit runs (gfx90a/gfx940):
;     wait(s) before the atomic, then a wait and a cache invalidate after it.
;   - gfx7, CU-mode, non-tgsplit and skip-cache-invalidation runs: the bare
;     atomic with no extra waits or invalidates.
; NOTE(review): presumably the extra synchronization appears only where the
; waves of a workgroup may not share one first-level cache -- confirm against
; the AMDGPU memory model documentation.
; The assertions are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
15735define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
15736; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15737; GFX7:       ; %bb.0: ; %entry
15738; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15739; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15740; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15741; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15742; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15743; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15744; GFX7-NEXT:    s_mov_b32 s4, s8
15745; GFX7-NEXT:    s_mov_b32 s5, s9
15746; GFX7-NEXT:    s_mov_b32 s9, s10
15747; GFX7-NEXT:    s_mov_b32 s8, s11
15748; GFX7-NEXT:    s_add_u32 s4, s4, s9
15749; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15750; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15751; GFX7-NEXT:    s_mov_b32 s5, s8
15752; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15753; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15754; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15755; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15756; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15757; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15758; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15759; GFX7-NEXT:    s_endpgm
15760;
15761; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15762; GFX10-WGP:       ; %bb.0: ; %entry
15763; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15764; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15765; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15766; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15767; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15768; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15769; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15770; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15771; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15772; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15773; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15774; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15775; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15776; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15777; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15778; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15779; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15780; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15781; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15782; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15783; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15784; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15785; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15786; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15787; GFX10-WGP-NEXT:    buffer_gl0_inv
15788; GFX10-WGP-NEXT:    s_endpgm
15789;
15790; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15791; GFX10-CU:       ; %bb.0: ; %entry
15792; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15793; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15794; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15795; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15796; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15797; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15798; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15799; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15800; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15801; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15802; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15803; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15804; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15805; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15806; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15807; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15808; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15809; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15810; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15811; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15812; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15813; GFX10-CU-NEXT:    s_endpgm
15814;
15815; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15816; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15817; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15818; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15819; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15820; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15821; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15822; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15823; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15824; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15825; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15826; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15827; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15828; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15829; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15830; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15831; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15832; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15833; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15834; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15835; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15836; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15837; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15838; SKIP-CACHE-INV-NEXT:    s_endpgm
15839;
15840; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15841; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15842; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15843; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15844; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15845; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15846; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15847; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15848; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15849; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15850; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15851; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15852; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15853;
15854; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15855; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15856; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15857; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15858; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15859; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15860; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15861; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15862; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15863; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15864; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15865; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15866; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15867; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15868; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15869; GFX90A-TGSPLIT-NEXT:    s_endpgm
15870;
15871; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15872; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15873; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15874; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15875; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15876; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15877; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15878; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15879; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15880; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15881; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15882; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15883; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15884;
15885; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15886; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15887; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15888; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15889; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15890; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15891; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15892; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15893; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15894; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15895; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15896; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15897; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15898; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15899; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
15900; GFX940-TGSPLIT-NEXT:    s_endpgm
15901;
15902; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15903; GFX11-WGP:       ; %bb.0: ; %entry
15904; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15905; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15906; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15907; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15908; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15909; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15910; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15911; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15912; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15913; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15914; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15915; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15916; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15917; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15918; GFX11-WGP-NEXT:    buffer_gl0_inv
15919; GFX11-WGP-NEXT:    s_endpgm
15920;
15921; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15922; GFX11-CU:       ; %bb.0: ; %entry
15923; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15924; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15925; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15926; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15927; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15928; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15929; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15930; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15931; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15932; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15933; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15934; GFX11-CU-NEXT:    s_endpgm
15935;
15936; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15937; GFX12-WGP:       ; %bb.0: ; %entry
15938; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15939; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15940; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15941; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15942; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15943; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15944; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15945; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15946; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15947; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15948; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15949; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15950; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15951; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15952; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15953; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15954; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
15955; GFX12-WGP-NEXT:    s_endpgm
15956;
15957; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15958; GFX12-CU:       ; %bb.0: ; %entry
15959; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15960; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15961; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15962; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15963; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15964; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15965; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15966; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15967; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15968; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15969; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15970; GFX12-CU-NEXT:    s_endpgm
15971    ptr %out, i32 %in, i32 %old) {
15972entry:
  ; Element index 4 of i32 is a 16-byte displacement; the expected output shows
  ; it either as `offset:16` on the atomic or as a scalar add of 16 to the base.
15973  %gep = getelementptr i32, ptr %out, i32 4
  ; Compares the memory word against %old and stores %in on a match. The
  ; operation is volatile and its result (%val) is never read.
15974  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
15975  ret void
15976}
15977
; Returning cmpxchg through a flat pointer at syncscope("workgroup-one-as")
; with monotonic success and monotonic failure ordering. The compare-and-swap
; targets %out advanced by 4 i32 elements, and the loaded value (field 0 of the
; result pair) is stored back to %out. None of the expected sequences below
; contains a cache invalidate instruction.
; NOTE(review): the name has no underscore between the two "monotonic" tokens,
; unlike the sibling flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg test.
; It is left unchanged because every -LABEL check below matches on it; confirm
; whether the spelling is intentional.
15978define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
15979; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
15980; GFX7:       ; %bb.0: ; %entry
15981; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
15982; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15983; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
15984; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
15985; GFX7-NEXT:    s_mov_b64 s[12:13], 16
15986; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15987; GFX7-NEXT:    s_mov_b32 s6, s4
15988; GFX7-NEXT:    s_mov_b32 s7, s5
15989; GFX7-NEXT:    s_mov_b32 s11, s12
15990; GFX7-NEXT:    s_mov_b32 s10, s13
15991; GFX7-NEXT:    s_add_u32 s6, s6, s11
15992; GFX7-NEXT:    s_addc_u32 s10, s7, s10
15993; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15994; GFX7-NEXT:    s_mov_b32 s7, s10
15995; GFX7-NEXT:    v_mov_b32_e32 v2, s9
15996; GFX7-NEXT:    v_mov_b32_e32 v0, s8
15997; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15998; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15999; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16000; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16001; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16002; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16003; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16004; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16005; GFX7-NEXT:    flat_store_dword v[0:1], v2
16006; GFX7-NEXT:    s_endpgm
16007;
16008; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16009; GFX10-WGP:       ; %bb.0: ; %entry
16010; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16011; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16012; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16013; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16014; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16015; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16016; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16017; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16018; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16019; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16020; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16021; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16022; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16023; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16024; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16025; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16026; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16027; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16028; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16029; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16030; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16031; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16032; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16033; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16034; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16035; GFX10-WGP-NEXT:    s_endpgm
16036;
16037; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16038; GFX10-CU:       ; %bb.0: ; %entry
16039; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16040; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16041; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16042; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16043; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16044; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16045; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16046; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16047; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16048; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16049; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16050; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16051; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16052; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16053; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16054; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16055; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16056; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16057; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16058; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16059; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16060; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16061; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16062; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16063; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16064; GFX10-CU-NEXT:    s_endpgm
16065;
16066; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16067; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16068; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16069; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16070; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16071; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16072; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16073; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16074; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16075; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16076; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16077; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16078; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16079; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16080; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16081; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16082; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16083; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16084; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16085; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16086; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16087; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16088; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16091; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16092; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16093; SKIP-CACHE-INV-NEXT:    s_endpgm
16094;
16095; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16096; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16097; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16098; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16099; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16100; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16101; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16102; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16103; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16104; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16105; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16106; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16107; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16109; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16110; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16111;
16112; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16113; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16114; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16115; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16116; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16117; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16118; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16119; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16120; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16121; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16122; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16123; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16124; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16125; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16126; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16127; GFX90A-TGSPLIT-NEXT:    s_endpgm
16128;
16129; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16130; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16131; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16132; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16133; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16134; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16135; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16136; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16137; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16138; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16139; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16140; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16141; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16142; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16143; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16144; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16145;
16146; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16147; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16148; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16149; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16150; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16151; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16152; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16153; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16154; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16155; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16156; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16157; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16158; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16159; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16160; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16161; GFX940-TGSPLIT-NEXT:    s_endpgm
16162;
16163; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16164; GFX11-WGP:       ; %bb.0: ; %entry
16165; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16166; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16167; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16168; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16169; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16170; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16171; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16172; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16173; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16174; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16175; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16176; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16177; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16178; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16179; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16180; GFX11-WGP-NEXT:    s_endpgm
16181;
16182; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16183; GFX11-CU:       ; %bb.0: ; %entry
16184; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16185; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16186; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16187; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16188; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16189; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16190; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16191; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16192; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16193; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16194; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16195; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16196; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16197; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16198; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16199; GFX11-CU-NEXT:    s_endpgm
16200;
16201; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16202; GFX12-WGP:       ; %bb.0: ; %entry
16203; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16204; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16205; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16206; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16207; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16208; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16209; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16210; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16211; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16212; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16213; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
16214; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16215; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16216; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
16217; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16218; GFX12-WGP-NEXT:    s_endpgm
16219;
16220; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16221; GFX12-CU:       ; %bb.0: ; %entry
16222; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16223; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16224; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16225; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16226; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16227; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16228; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16229; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16230; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16231; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16232; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16233; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16234; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16235; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16236; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16237; GFX12-CU-NEXT:    s_endpgm
16238    ptr %out, i32 %in, i32 %old) {
16239entry:
  ; %gep is %out advanced by 4 i32 elements, i.e. 16 bytes; this is the
  ; "offset:16" / constant 16 that appears in the expected sequences above.
16240  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is (pointer, expected = %old, replacement = %in).
16241  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
16242  %val0 = extractvalue { i32, i1 } %val, 0
16243  store i32 %val0, ptr %out, align 4
16244  ret void
16245}
16246
; Returning cmpxchg through a flat pointer at syncscope("workgroup-one-as")
; with acquire success ordering and monotonic failure ordering. The
; compare-and-swap targets %out advanced by 4 i32 elements, and the loaded value
; (field 0 of the result pair) is stored back to %out.
; In the expected sequences below, a cache invalidate follows the atomic only
; for the GFX10-WGP and GFX11-WGP runs (buffer_gl0_inv), the GFX90A-TGSPLIT run
; (buffer_wbinvl1_vol), the GFX940-TGSPLIT run (buffer_inv sc0) and the
; GFX12-WGP run (global_inv scope:SCOPE_SE); the remaining runs expect only the
; wait before the final store.
16247define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
16248; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16249; GFX7:       ; %bb.0: ; %entry
16250; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16251; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16252; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16253; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16254; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16255; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16256; GFX7-NEXT:    s_mov_b32 s6, s4
16257; GFX7-NEXT:    s_mov_b32 s7, s5
16258; GFX7-NEXT:    s_mov_b32 s11, s12
16259; GFX7-NEXT:    s_mov_b32 s10, s13
16260; GFX7-NEXT:    s_add_u32 s6, s6, s11
16261; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16262; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16263; GFX7-NEXT:    s_mov_b32 s7, s10
16264; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16265; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16266; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16267; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16268; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16269; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16270; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16271; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16272; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16273; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16274; GFX7-NEXT:    flat_store_dword v[0:1], v2
16275; GFX7-NEXT:    s_endpgm
16276;
16277; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16278; GFX10-WGP:       ; %bb.0: ; %entry
16279; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16280; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16281; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16282; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16283; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16284; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16285; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16286; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16287; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16288; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16289; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16290; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16291; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16292; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16293; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16294; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16295; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16296; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16297; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16298; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16299; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16300; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16301; GFX10-WGP-NEXT:    buffer_gl0_inv
16302; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16303; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16304; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16305; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16306; GFX10-WGP-NEXT:    s_endpgm
16307;
16308; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16309; GFX10-CU:       ; %bb.0: ; %entry
16310; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16311; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16312; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16313; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16314; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16315; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16316; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16317; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16318; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16319; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16320; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16321; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16322; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16323; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16324; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16325; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16326; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16327; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16328; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16329; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16330; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16331; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16332; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16333; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16334; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16335; GFX10-CU-NEXT:    s_endpgm
16336;
16337; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16338; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16339; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16340; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16341; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16342; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16343; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16344; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16345; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16346; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16347; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16348; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16349; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16350; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16351; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16352; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16354; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16355; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16357; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16358; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16359; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16360; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16361; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16362; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16363; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16364; SKIP-CACHE-INV-NEXT:    s_endpgm
16365;
16366; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16367; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16368; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16369; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16370; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16371; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16372; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16373; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16374; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16375; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16376; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16377; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16378; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16379; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16380; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16381; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16382;
16383; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16384; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16385; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16386; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16387; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16388; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16389; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16390; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16391; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16392; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16393; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16394; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16395; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16396; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
16397; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16398; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16399; GFX90A-TGSPLIT-NEXT:    s_endpgm
16400;
16401; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16402; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16403; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16404; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16405; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16406; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16407; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16408; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16409; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16410; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16411; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16412; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16413; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16414; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16415; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16416; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16417;
16418; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16419; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16420; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16421; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16422; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16423; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16424; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16425; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16426; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16427; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16428; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16429; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16430; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16431; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
16432; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16433; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16434; GFX940-TGSPLIT-NEXT:    s_endpgm
16435;
16436; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16437; GFX11-WGP:       ; %bb.0: ; %entry
16438; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16439; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16440; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16441; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16442; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16443; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16444; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16445; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16446; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16447; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16448; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16449; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16450; GFX11-WGP-NEXT:    buffer_gl0_inv
16451; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16452; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16453; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16454; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16455; GFX11-WGP-NEXT:    s_endpgm
16456;
16457; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16458; GFX11-CU:       ; %bb.0: ; %entry
16459; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16460; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16461; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16462; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16463; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16464; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16465; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16466; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16467; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16468; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16469; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16470; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16471; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16472; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16473; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16474; GFX11-CU-NEXT:    s_endpgm
16475;
16476; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16477; GFX12-WGP:       ; %bb.0: ; %entry
16478; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16479; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16480; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16481; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16482; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16483; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16484; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16485; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16486; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16487; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16488; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
16489; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16490; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
16491; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16492; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16493; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
16494; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16495; GFX12-WGP-NEXT:    s_endpgm
16496;
16497; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16498; GFX12-CU:       ; %bb.0: ; %entry
16499; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16500; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16501; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16502; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16503; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16504; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16505; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16506; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16507; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16508; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16509; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16510; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16511; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16512; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16513; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16514; GFX12-CU-NEXT:    s_endpgm
16515    ptr %out, i32 %in, i32 %old) {
16516entry:
  ; %gep is %out advanced by 4 i32 elements, i.e. 16 bytes; this is the
  ; "offset:16" / constant 16 that appears in the expected sequences above.
16517  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is (pointer, expected = %old, replacement = %in); acquire is
  ; the success ordering and monotonic the failure ordering.
16518  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
16519  %val0 = extractvalue { i32, i1 } %val, 0
16520  store i32 %val0, ptr %out, align 4
16521  ret void
16522}
16523
16524define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
16525; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16526; GFX7:       ; %bb.0: ; %entry
16527; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16528; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16529; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16530; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16531; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16532; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16533; GFX7-NEXT:    s_mov_b32 s6, s4
16534; GFX7-NEXT:    s_mov_b32 s7, s5
16535; GFX7-NEXT:    s_mov_b32 s11, s12
16536; GFX7-NEXT:    s_mov_b32 s10, s13
16537; GFX7-NEXT:    s_add_u32 s6, s6, s11
16538; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16539; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16540; GFX7-NEXT:    s_mov_b32 s7, s10
16541; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16542; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16543; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16544; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16545; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16546; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16547; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16548; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16549; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16550; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16551; GFX7-NEXT:    flat_store_dword v[0:1], v2
16552; GFX7-NEXT:    s_endpgm
16553;
16554; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16555; GFX10-WGP:       ; %bb.0: ; %entry
16556; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16557; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16558; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16559; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16560; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16561; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16562; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16563; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16564; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16565; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16566; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16567; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16568; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16569; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16570; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16571; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16572; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16573; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16574; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16575; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16576; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16577; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16578; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16579; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16580; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16581; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16582; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16583; GFX10-WGP-NEXT:    s_endpgm
16584;
16585; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16586; GFX10-CU:       ; %bb.0: ; %entry
16587; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16588; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16589; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16590; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16591; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16592; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16593; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16594; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16595; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16596; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16597; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16598; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16599; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16600; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16601; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16602; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16603; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16604; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16605; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16606; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16607; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16608; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16609; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16610; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16611; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16612; GFX10-CU-NEXT:    s_endpgm
16613;
16614; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16615; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16616; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16617; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16618; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16619; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16620; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16621; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16622; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16623; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16624; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16625; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16626; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16627; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16628; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16629; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16630; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16631; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16632; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16633; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16634; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16635; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16636; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16638; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16639; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16640; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16641; SKIP-CACHE-INV-NEXT:    s_endpgm
16642;
16643; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16644; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16645; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16646; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16647; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16648; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16649; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16650; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16651; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16652; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16653; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16654; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16655; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16656; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16657; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16658; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16659;
16660; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16661; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16662; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16663; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16664; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16665; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16666; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16667; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16668; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16669; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16670; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16671; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16672; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16673; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16674; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16675; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16676; GFX90A-TGSPLIT-NEXT:    s_endpgm
16677;
16678; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16679; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16680; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16681; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16682; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16683; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16684; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16685; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16686; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16687; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16688; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16689; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16690; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16691; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16692; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16693; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16694;
16695; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16696; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16697; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16698; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16699; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16700; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16701; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16702; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16703; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16704; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16705; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16706; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16707; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16708; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16709; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16710; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16711; GFX940-TGSPLIT-NEXT:    s_endpgm
16712;
16713; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16714; GFX11-WGP:       ; %bb.0: ; %entry
16715; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16716; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16717; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16718; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16719; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16720; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16721; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16722; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16723; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16724; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16725; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16726; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16727; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16728; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16729; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16730; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16731; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16732; GFX11-WGP-NEXT:    s_endpgm
16733;
16734; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16735; GFX11-CU:       ; %bb.0: ; %entry
16736; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16737; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16738; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16739; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16740; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16741; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16742; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16743; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16744; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16745; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16746; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16747; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16748; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16749; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16750; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16751; GFX11-CU-NEXT:    s_endpgm
16752;
16753; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16754; GFX12-WGP:       ; %bb.0: ; %entry
16755; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16756; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16757; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16758; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16759; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16760; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16761; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16762; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16763; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16764; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16765; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
16766; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
16767; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16768; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16769; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
16770; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16771; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16772; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
16773; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16774; GFX12-WGP-NEXT:    s_endpgm
16775;
16776; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16777; GFX12-CU:       ; %bb.0: ; %entry
16778; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16779; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16780; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16781; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16782; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16783; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16784; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16785; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16786; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16787; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16788; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16789; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16790; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16791; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16792; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16793; GFX12-CU-NEXT:    s_endpgm
16794    ptr %out, i32 %in, i32 %old) {
16795entry:
16796  %gep = getelementptr i32, ptr %out, i32 4
16797  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
16798  %val0 = extractvalue { i32, i1 } %val, 0
16799  store i32 %val0, ptr %out, align 4
16800  ret void
16801}
16802
; Test: returning flat cmpxchg at syncscope "workgroup-one-as" with success
; ordering acq_rel and failure ordering monotonic. The kernel compare-and-swaps
; the i32 located 16 bytes past %out (compare value %old, new value %in) and
; then stores the value the atomic loaded back to %out itself.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; What the expected output shows, per RUN configuration:
;   - gfx1010 / gfx1100 / gfx1200 without +cumode: waits are placed before the
;     atomic, and a wait plus a cache invalidate (buffer_gl0_inv, or global_inv
;     scope SCOPE_SE on gfx1200) are placed after it.
;   - gfx90a / gfx940 with +tgsplit: a vmcnt wait before the atomic, and a wait
;     plus buffer_wbinvl1_vol / buffer_inv sc0 after it.
;   - every other configuration: no invalidate; only a combined wait ahead of
;     the final store.
; NOTE(review): the post-atomic invalidates are presumably the acquire half of
; acq_rel -- confirm against the AMDGPU memory model before relying on this.
16803define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
16804; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16805; GFX7:       ; %bb.0: ; %entry
16806; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16807; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16808; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16809; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16810; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16811; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16812; GFX7-NEXT:    s_mov_b32 s6, s4
16813; GFX7-NEXT:    s_mov_b32 s7, s5
16814; GFX7-NEXT:    s_mov_b32 s11, s12
16815; GFX7-NEXT:    s_mov_b32 s10, s13
16816; GFX7-NEXT:    s_add_u32 s6, s6, s11
16817; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16818; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16819; GFX7-NEXT:    s_mov_b32 s7, s10
16820; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16821; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16822; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16823; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16824; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16825; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16826; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16827; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16828; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16829; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16830; GFX7-NEXT:    flat_store_dword v[0:1], v2
16831; GFX7-NEXT:    s_endpgm
16832;
16833; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16834; GFX10-WGP:       ; %bb.0: ; %entry
16835; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16836; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16837; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16838; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16839; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16840; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16841; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16842; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16843; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16844; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16845; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16846; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16847; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16848; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16849; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16850; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16851; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16852; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16853; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16854; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16855; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16856; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16857; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16858; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16859; GFX10-WGP-NEXT:    buffer_gl0_inv
16860; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16861; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16862; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16863; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16864; GFX10-WGP-NEXT:    s_endpgm
16865;
16866; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16867; GFX10-CU:       ; %bb.0: ; %entry
16868; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16869; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16870; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16871; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16872; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16873; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16874; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16875; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16876; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16877; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16878; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16879; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16880; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16881; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16882; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16883; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16884; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16885; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16886; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16887; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16888; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16889; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16890; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16891; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16892; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16893; GFX10-CU-NEXT:    s_endpgm
16894;
16895; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16896; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16897; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16898; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16899; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16900; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16901; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16902; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16903; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16904; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16905; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16906; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16907; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16908; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16909; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16910; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16911; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16913; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16914; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16917; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16919; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16920; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16921; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16922; SKIP-CACHE-INV-NEXT:    s_endpgm
16923;
16924; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16925; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16926; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16927; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16928; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16929; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16930; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16931; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16932; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16933; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16934; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16935; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16936; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16937; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16938; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16939; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16940;
16941; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16942; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16943; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16944; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16945; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16946; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16947; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16948; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16949; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16950; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16951; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16952; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16953; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16954; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16955; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
16956; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16957; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16958; GFX90A-TGSPLIT-NEXT:    s_endpgm
16959;
16960; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16961; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16962; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16963; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16964; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16965; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16966; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16967; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16968; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16969; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16970; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16971; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16972; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16973; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16974; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16975; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16976;
16977; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16978; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16979; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16980; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16981; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16982; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16983; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16984; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16985; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16986; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16987; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16988; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16989; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16990; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16991; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
16992; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16993; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16994; GFX940-TGSPLIT-NEXT:    s_endpgm
16995;
16996; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16997; GFX11-WGP:       ; %bb.0: ; %entry
16998; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16999; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17000; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17001; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17002; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17003; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17004; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17005; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17006; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17007; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17008; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17009; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17010; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17011; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17012; GFX11-WGP-NEXT:    buffer_gl0_inv
17013; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17014; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17015; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17016; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17017; GFX11-WGP-NEXT:    s_endpgm
17018;
17019; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
17020; GFX11-CU:       ; %bb.0: ; %entry
17021; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17022; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17023; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17024; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17025; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17026; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17027; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17028; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17029; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17030; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17031; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17032; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17033; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17034; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17035; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17036; GFX11-CU-NEXT:    s_endpgm
17037;
17038; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
17039; GFX12-WGP:       ; %bb.0: ; %entry
17040; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17041; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17042; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17043; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17044; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17045; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17046; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17047; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17048; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17049; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17050; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17051; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17052; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17053; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17054; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17055; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17056; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17057; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17058; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
17059; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17060; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17061; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
17062; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17063; GFX12-WGP-NEXT:    s_endpgm
17064;
17065; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
17066; GFX12-CU:       ; %bb.0: ; %entry
17067; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17068; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17069; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17070; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17071; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17072; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17073; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17074; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17075; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17076; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17077; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17078; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17079; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17080; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17081; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17082; GFX12-CU-NEXT:    s_endpgm
17083    ptr %out, i32 %in, i32 %old) {
17084entry:
; Index 4 of an i32 array is a 16-byte displacement; it appears in the checks
; either as "offset:16" on the atomic or as an explicit 64-bit add of 16.
17085  %gep = getelementptr i32, ptr %out, i32 4
17086  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
; Element 0 of the cmpxchg result pair is the value loaded from memory; the i1
; success flag (element 1) is unused.
17087  %val0 = extractvalue { i32, i1 } %val, 0
17088  store i32 %val0, ptr %out, align 4
17089  ret void
17090}
17091
17092define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
17093; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17094; GFX7:       ; %bb.0: ; %entry
17095; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17096; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17097; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17098; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17099; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17100; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17101; GFX7-NEXT:    s_mov_b32 s6, s4
17102; GFX7-NEXT:    s_mov_b32 s7, s5
17103; GFX7-NEXT:    s_mov_b32 s11, s12
17104; GFX7-NEXT:    s_mov_b32 s10, s13
17105; GFX7-NEXT:    s_add_u32 s6, s6, s11
17106; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17107; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17108; GFX7-NEXT:    s_mov_b32 s7, s10
17109; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17110; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17111; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17112; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17113; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17114; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17115; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17116; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17117; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17118; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17119; GFX7-NEXT:    flat_store_dword v[0:1], v2
17120; GFX7-NEXT:    s_endpgm
17121;
17122; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17123; GFX10-WGP:       ; %bb.0: ; %entry
17124; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17125; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17126; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17127; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17128; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17129; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17130; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17131; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17132; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17133; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17134; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17135; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17136; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17137; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17138; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17139; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17140; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17141; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17142; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17143; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17144; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17145; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17146; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17147; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17148; GFX10-WGP-NEXT:    buffer_gl0_inv
17149; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17150; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17151; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17152; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17153; GFX10-WGP-NEXT:    s_endpgm
17154;
17155; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17156; GFX10-CU:       ; %bb.0: ; %entry
17157; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
17158; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17159; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
17160; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
17161; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
17162; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17163; GFX10-CU-NEXT:    s_mov_b32 s6, s4
17164; GFX10-CU-NEXT:    s_mov_b32 s7, s5
17165; GFX10-CU-NEXT:    s_mov_b32 s11, s12
17166; GFX10-CU-NEXT:    s_mov_b32 s10, s13
17167; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
17168; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
17169; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17170; GFX10-CU-NEXT:    s_mov_b32 s7, s10
17171; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
17172; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
17173; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17174; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17175; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17176; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17177; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17178; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17179; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17180; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17181; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
17182; GFX10-CU-NEXT:    s_endpgm
17183;
17184; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17185; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17186; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17187; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17188; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17189; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17190; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17191; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17192; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17193; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17194; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17195; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17196; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17197; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17198; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17199; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17200; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17201; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17202; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17203; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17204; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17206; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17207; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17209; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17210; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17211; SKIP-CACHE-INV-NEXT:    s_endpgm
17212;
17213; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17214; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17215; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17216; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17217; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17218; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17219; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17220; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17221; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17222; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17223; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17224; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17225; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17226; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17227; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17228; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17229;
17230; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17231; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17232; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17233; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17234; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17235; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17236; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17237; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17238; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17239; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17240; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17241; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17242; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17243; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17244; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
17245; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17246; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17247; GFX90A-TGSPLIT-NEXT:    s_endpgm
17248;
17249; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17250; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17251; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17252; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17253; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17254; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17255; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17256; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17257; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17258; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17259; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17260; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17261; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17262; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17263; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17264; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17265;
17266; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17267; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17268; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17269; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17270; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17271; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17272; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17273; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17274; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17275; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17276; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17277; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17278; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17279; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17280; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
17281; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17282; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17283; GFX940-TGSPLIT-NEXT:    s_endpgm
17284;
17285; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17286; GFX11-WGP:       ; %bb.0: ; %entry
17287; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17288; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17289; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17290; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17291; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17292; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17293; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17294; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17295; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17296; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17297; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17298; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17299; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17300; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17301; GFX11-WGP-NEXT:    buffer_gl0_inv
17302; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17303; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17304; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17305; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17306; GFX11-WGP-NEXT:    s_endpgm
17307;
17308; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17309; GFX11-CU:       ; %bb.0: ; %entry
17310; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17311; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17312; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17313; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17314; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17315; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17316; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17317; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17318; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17319; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17320; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17321; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17322; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17323; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17324; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17325; GFX11-CU-NEXT:    s_endpgm
17326;
17327; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17328; GFX12-WGP:       ; %bb.0: ; %entry
17329; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17330; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17331; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17332; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17333; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17334; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17335; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17336; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17337; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17338; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17339; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17340; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17341; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17342; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17343; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17344; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17345; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17346; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17347; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
17348; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17349; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17350; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
17351; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17352; GFX12-WGP-NEXT:    s_endpgm
17353;
17354; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17355; GFX12-CU:       ; %bb.0: ; %entry
17356; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17357; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17358; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17359; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17360; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17361; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17362; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17363; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17364; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17365; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17366; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17367; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17368; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17369; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17370; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17371; GFX12-CU-NEXT:    s_endpgm
17372    ptr %out, i32 %in, i32 %old) {
17373entry:
17374  %gep = getelementptr i32, ptr %out, i32 4
17375  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
17376  %val0 = extractvalue { i32, i1 } %val, 0
17377  store i32 %val0, ptr %out, align 4
17378  ret void
17379}
17380
; Checks the code generated for a volatile, value-returning cmpxchg through a
; generic `ptr` with syncscope("workgroup-one-as"), success ordering monotonic
; and failure ordering acquire. The cmpxchg targets element 4 of %out (the
; expectations show this as a 16-byte offset) and the value it loads is then
; stored to %out itself.
; In the expectations below, the *-WGP and *-TGSPLIT configurations wait on the
; atomic and then issue an invalidate instruction (buffer_gl0_inv, global_inv,
; buffer_wbinvl1_vol or buffer_inv) before the store; the remaining
; configurations only wait before the store.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
17381define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
17382; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17383; GFX7:       ; %bb.0: ; %entry
17384; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17385; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17386; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17387; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17388; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17389; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17390; GFX7-NEXT:    s_mov_b32 s6, s4
17391; GFX7-NEXT:    s_mov_b32 s7, s5
17392; GFX7-NEXT:    s_mov_b32 s11, s12
17393; GFX7-NEXT:    s_mov_b32 s10, s13
17394; GFX7-NEXT:    s_add_u32 s6, s6, s11
17395; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17396; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17397; GFX7-NEXT:    s_mov_b32 s7, s10
17398; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17399; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17400; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17401; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17402; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17403; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17404; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17405; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17406; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17407; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17408; GFX7-NEXT:    flat_store_dword v[0:1], v2
17409; GFX7-NEXT:    s_endpgm
17410;
17411; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17412; GFX10-WGP:       ; %bb.0: ; %entry
17413; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17414; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17415; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17416; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17417; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17418; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17419; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17420; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17421; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17422; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17423; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17424; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17425; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17426; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17427; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17428; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17429; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17430; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17431; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17432; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17433; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17434; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17435; GFX10-WGP-NEXT:    buffer_gl0_inv
17436; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17437; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17438; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17439; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17440; GFX10-WGP-NEXT:    s_endpgm
17441;
17442; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17443; GFX10-CU:       ; %bb.0: ; %entry
17444; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
17445; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17446; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
17447; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
17448; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
17449; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17450; GFX10-CU-NEXT:    s_mov_b32 s6, s4
17451; GFX10-CU-NEXT:    s_mov_b32 s7, s5
17452; GFX10-CU-NEXT:    s_mov_b32 s11, s12
17453; GFX10-CU-NEXT:    s_mov_b32 s10, s13
17454; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
17455; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
17456; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17457; GFX10-CU-NEXT:    s_mov_b32 s7, s10
17458; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
17459; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
17460; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17461; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17462; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17463; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17464; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17465; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17466; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17467; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17468; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
17469; GFX10-CU-NEXT:    s_endpgm
17470;
17471; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17472; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17473; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17474; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17475; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17476; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17477; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17478; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17479; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17480; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17481; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17482; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17483; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17484; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17485; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17486; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17487; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17488; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17489; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17491; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17492; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17493; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17494; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17496; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17497; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17498; SKIP-CACHE-INV-NEXT:    s_endpgm
17499;
17500; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17501; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17502; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17503; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17504; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17505; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17506; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17507; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17508; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17509; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17510; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17511; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17512; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17513; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17514; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17515; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17516;
17517; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17518; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17519; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17520; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17521; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17522; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17523; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17524; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17525; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17526; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17527; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17528; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17529; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17530; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
17531; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17532; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17533; GFX90A-TGSPLIT-NEXT:    s_endpgm
17534;
17535; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17536; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17537; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17538; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17539; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17540; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17541; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17542; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17543; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17544; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17545; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17546; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17547; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17548; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17549; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17550; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17551;
17552; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17553; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17554; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17555; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17556; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17557; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17558; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17559; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17560; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17561; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17562; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17563; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17564; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17565; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
17566; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17567; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17568; GFX940-TGSPLIT-NEXT:    s_endpgm
17569;
17570; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17571; GFX11-WGP:       ; %bb.0: ; %entry
17572; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17573; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17574; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17575; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17576; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17577; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17578; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17579; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17580; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17581; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17582; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17583; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17584; GFX11-WGP-NEXT:    buffer_gl0_inv
17585; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17586; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17587; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17588; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17589; GFX11-WGP-NEXT:    s_endpgm
17590;
17591; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17592; GFX11-CU:       ; %bb.0: ; %entry
17593; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17594; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17595; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17596; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17597; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17598; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17599; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17600; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17601; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17602; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17603; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17604; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17605; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17606; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17607; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17608; GFX11-CU-NEXT:    s_endpgm
17609;
17610; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17611; GFX12-WGP:       ; %bb.0: ; %entry
17612; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17613; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17614; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17615; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17616; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17617; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17618; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17619; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17620; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17621; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17622; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17623; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17624; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17625; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17626; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
17627; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17628; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17629; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
17630; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17631; GFX12-WGP-NEXT:    s_endpgm
17632;
17633; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17634; GFX12-CU:       ; %bb.0: ; %entry
17635; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17636; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17637; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17638; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17639; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17640; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17641; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17642; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17643; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17644; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17645; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17646; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17647; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17648; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17649; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17650; GFX12-CU-NEXT:    s_endpgm
17651    ptr %out, i32 %in, i32 %old) {
17652entry:
  ; Address the cmpxchg at element 4 of %out, i.e. 16 bytes past the base.
17653  %gep = getelementptr i32, ptr %out, i32 4
17654  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
  ; Field 0 of the cmpxchg result is the loaded value; it is stored to %out so
  ; the returning form of the atomic is exercised.
17655  %val0 = extractvalue { i32, i1 } %val, 0
17656  store i32 %val0, ptr %out, align 4
17657  ret void
17658}
17659
17660define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
17661; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17662; GFX7:       ; %bb.0: ; %entry
17663; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17664; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17665; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17666; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17667; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17668; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17669; GFX7-NEXT:    s_mov_b32 s6, s4
17670; GFX7-NEXT:    s_mov_b32 s7, s5
17671; GFX7-NEXT:    s_mov_b32 s11, s12
17672; GFX7-NEXT:    s_mov_b32 s10, s13
17673; GFX7-NEXT:    s_add_u32 s6, s6, s11
17674; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17675; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17676; GFX7-NEXT:    s_mov_b32 s7, s10
17677; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17678; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17679; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17680; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17681; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17682; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17683; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17684; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17685; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17686; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17687; GFX7-NEXT:    flat_store_dword v[0:1], v2
17688; GFX7-NEXT:    s_endpgm
17689;
17690; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17691; GFX10-WGP:       ; %bb.0: ; %entry
17692; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17693; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17694; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17695; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17696; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17697; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17698; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17699; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17700; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17701; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17702; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17703; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17704; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17705; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17706; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17707; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17708; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17709; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17710; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17711; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17712; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17713; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17714; GFX10-WGP-NEXT:    buffer_gl0_inv
17715; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17716; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17717; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17718; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17719; GFX10-WGP-NEXT:    s_endpgm
17720;
17721; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17722; GFX10-CU:       ; %bb.0: ; %entry
17723; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
17724; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17725; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
17726; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
17727; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
17728; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17729; GFX10-CU-NEXT:    s_mov_b32 s6, s4
17730; GFX10-CU-NEXT:    s_mov_b32 s7, s5
17731; GFX10-CU-NEXT:    s_mov_b32 s11, s12
17732; GFX10-CU-NEXT:    s_mov_b32 s10, s13
17733; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
17734; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
17735; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17736; GFX10-CU-NEXT:    s_mov_b32 s7, s10
17737; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
17738; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
17739; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17740; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17741; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17742; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17743; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17744; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17745; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17746; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17747; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
17748; GFX10-CU-NEXT:    s_endpgm
17749;
17750; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17751; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17752; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17753; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17754; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17755; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17756; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17757; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17758; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17759; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17760; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17761; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17762; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17763; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17764; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17765; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17767; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17768; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17769; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17770; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17771; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17772; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17773; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17774; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17775; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17776; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17777; SKIP-CACHE-INV-NEXT:    s_endpgm
17778;
17779; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17780; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17781; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17782; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17783; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17784; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17785; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17786; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17787; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17788; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17789; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17790; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17791; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17792; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17793; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17794; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17795;
17796; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17797; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17798; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17799; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17800; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17801; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17802; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17803; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17804; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17805; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17806; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17807; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17808; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17809; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
17810; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17811; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17812; GFX90A-TGSPLIT-NEXT:    s_endpgm
17813;
17814; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17815; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17816; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17817; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17818; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17819; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17820; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17821; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17822; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17823; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17824; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17825; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17826; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17827; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17828; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17829; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17830;
17831; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17832; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17833; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17834; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17835; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17836; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17837; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17838; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17839; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17840; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17841; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17842; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17843; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17844; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
17845; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17846; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17847; GFX940-TGSPLIT-NEXT:    s_endpgm
17848;
17849; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17850; GFX11-WGP:       ; %bb.0: ; %entry
17851; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17852; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17853; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17854; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17855; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17856; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17857; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17858; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17859; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17860; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17861; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17862; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17863; GFX11-WGP-NEXT:    buffer_gl0_inv
17864; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17865; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17866; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17867; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17868; GFX11-WGP-NEXT:    s_endpgm
17869;
17870; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17871; GFX11-CU:       ; %bb.0: ; %entry
17872; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17873; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17874; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17875; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17876; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17877; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17878; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17879; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17880; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17881; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17882; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17883; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17884; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17885; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17886; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17887; GFX11-CU-NEXT:    s_endpgm
17888;
17889; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17890; GFX12-WGP:       ; %bb.0: ; %entry
17891; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17892; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17893; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17894; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17895; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17896; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17897; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17898; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17899; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17900; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17901; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17902; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17903; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
17904; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17905; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17906; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
17907; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17908; GFX12-WGP-NEXT:    s_endpgm
17909;
17910; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17911; GFX12-CU:       ; %bb.0: ; %entry
17912; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17913; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17914; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17915; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17916; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17917; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17918; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17919; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17920; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17921; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17922; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17923; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17924; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17925; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17926; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17927; GFX12-CU-NEXT:    s_endpgm
17928    ptr %out, i32 %in, i32 %old) {
17929entry:
17930  %gep = getelementptr i32, ptr %out, i32 4
17931  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
17932  %val0 = extractvalue { i32, i1 } %val, 0
17933  store i32 %val0, ptr %out, align 4
17934  ret void
17935}
17936
; Kernel under test: a volatile cmpxchg through a generic (flat) pointer with
; syncscope("workgroup-one-as"), success ordering release and failure ordering
; acquire. The value returned by the cmpxchg is stored back to %out, so the
; returning form of the atomic is exercised.
; The per-target assertions that follow are autogenerated (see the note on the
; first line of this file); regenerate them with the update script instead of
; editing them by hand.
17937define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
17938; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
17939; GFX7:       ; %bb.0: ; %entry
17940; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17941; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17942; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17943; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17944; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17945; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17946; GFX7-NEXT:    s_mov_b32 s6, s4
17947; GFX7-NEXT:    s_mov_b32 s7, s5
17948; GFX7-NEXT:    s_mov_b32 s11, s12
17949; GFX7-NEXT:    s_mov_b32 s10, s13
17950; GFX7-NEXT:    s_add_u32 s6, s6, s11
17951; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17952; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17953; GFX7-NEXT:    s_mov_b32 s7, s10
17954; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17955; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17956; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17957; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17958; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17959; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17960; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17961; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17962; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17963; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17964; GFX7-NEXT:    flat_store_dword v[0:1], v2
17965; GFX7-NEXT:    s_endpgm
17966;
17967; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
17968; GFX10-WGP:       ; %bb.0: ; %entry
17969; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17970; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17971; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17972; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17973; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17974; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17975; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17976; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17977; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17978; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17979; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17980; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17981; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17982; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17983; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17984; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17985; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17986; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17987; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17988; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17989; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17990; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17991; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17992; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17993; GFX10-WGP-NEXT:    buffer_gl0_inv
17994; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17995; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17996; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17997; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17998; GFX10-WGP-NEXT:    s_endpgm
17999;
18000; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18001; GFX10-CU:       ; %bb.0: ; %entry
18002; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18003; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18004; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18005; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18006; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18007; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18008; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18009; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18010; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18011; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18012; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18013; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18014; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18015; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18016; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18017; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18018; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18019; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18020; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18021; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18022; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18023; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18024; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18025; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18026; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18027; GFX10-CU-NEXT:    s_endpgm
18028;
18029; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18030; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18031; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18032; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18033; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18034; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18035; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18036; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18037; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18038; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18039; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18040; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18041; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18042; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18043; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18044; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18045; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18046; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18047; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18048; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18050; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18051; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18053; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18054; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18055; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18056; SKIP-CACHE-INV-NEXT:    s_endpgm
18057;
18058; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18059; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18060; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18061; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18062; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18063; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18064; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18065; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18066; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18067; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18068; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18069; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18070; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18071; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18072; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18073; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18074;
18075; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18076; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18077; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18078; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18079; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18080; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18081; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18082; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18083; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18084; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18085; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18086; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18087; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18088; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18089; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18090; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18091; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18092; GFX90A-TGSPLIT-NEXT:    s_endpgm
18093;
18094; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18095; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18096; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18097; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18098; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18099; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18100; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18101; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18102; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18103; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18104; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18105; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18106; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18107; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18108; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18109; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18110;
18111; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18112; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18113; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18114; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18115; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18116; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18117; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18118; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18119; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18120; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18121; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18122; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18123; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18124; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18125; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
18126; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18127; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18128; GFX940-TGSPLIT-NEXT:    s_endpgm
18129;
18130; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18131; GFX11-WGP:       ; %bb.0: ; %entry
18132; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18133; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18134; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18135; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18136; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18137; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18138; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18139; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18140; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18141; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18142; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18143; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18144; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18145; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18146; GFX11-WGP-NEXT:    buffer_gl0_inv
18147; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18148; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18149; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18150; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18151; GFX11-WGP-NEXT:    s_endpgm
18152;
18153; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18154; GFX11-CU:       ; %bb.0: ; %entry
18155; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18156; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18157; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18158; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18159; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18160; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18161; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18162; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18163; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18164; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18165; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18166; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18167; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18168; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18169; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18170; GFX11-CU-NEXT:    s_endpgm
18171;
18172; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18173; GFX12-WGP:       ; %bb.0: ; %entry
18174; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18175; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18176; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18177; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18178; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18179; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18180; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18181; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18182; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18183; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18184; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18185; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18186; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18187; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18188; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
18189; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18190; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18191; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18192; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
18193; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18194; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18195; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
18196; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18197; GFX12-WGP-NEXT:    s_endpgm
18198;
18199; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18200; GFX12-CU:       ; %bb.0: ; %entry
18201; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18202; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18203; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18204; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18205; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18206; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18207; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18208; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18209; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18210; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18211; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18212; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18213; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18214; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18215; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18216; GFX12-CU-NEXT:    s_endpgm
18217    ptr %out, i32 %in, i32 %old) {
18218entry:
  ; The atomic targets the i32 at index 4 from %out, i.e. byte offset 16, which
  ; is the constant 16 visible in the expected instruction sequences above.
18219  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and, on a match, write %in.
18220  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
  ; Field 0 of the { i32, i1 } result is the value that was loaded; the i1
  ; success flag is not used by this test.
18221  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the loaded value to the base of %out so the atomic result is live.
18222  store i32 %val0, ptr %out, align 4
18223  ret void
18224}
18225
18226define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
18227; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18228; GFX7:       ; %bb.0: ; %entry
18229; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18230; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18231; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18232; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18233; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18234; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18235; GFX7-NEXT:    s_mov_b32 s6, s4
18236; GFX7-NEXT:    s_mov_b32 s7, s5
18237; GFX7-NEXT:    s_mov_b32 s11, s12
18238; GFX7-NEXT:    s_mov_b32 s10, s13
18239; GFX7-NEXT:    s_add_u32 s6, s6, s11
18240; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18241; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18242; GFX7-NEXT:    s_mov_b32 s7, s10
18243; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18244; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18245; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18246; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18247; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18248; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18249; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18250; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18251; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18252; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18253; GFX7-NEXT:    flat_store_dword v[0:1], v2
18254; GFX7-NEXT:    s_endpgm
18255;
18256; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18257; GFX10-WGP:       ; %bb.0: ; %entry
18258; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18259; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18260; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18261; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18262; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18263; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18264; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18265; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18266; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18267; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18268; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18269; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18270; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18271; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18272; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18273; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18274; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18275; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18276; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18277; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18278; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18279; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18280; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18281; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18282; GFX10-WGP-NEXT:    buffer_gl0_inv
18283; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18284; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18285; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18286; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18287; GFX10-WGP-NEXT:    s_endpgm
18288;
18289; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18290; GFX10-CU:       ; %bb.0: ; %entry
18291; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18292; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18293; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18294; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18295; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18296; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18297; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18298; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18299; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18300; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18301; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18302; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18303; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18304; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18305; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18306; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18307; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18308; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18309; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18310; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18311; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18312; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18313; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18314; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18315; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18316; GFX10-CU-NEXT:    s_endpgm
18317;
18318; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18319; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18320; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18321; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18322; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18323; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18324; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18325; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18326; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18327; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18328; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18329; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18330; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18331; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18332; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18333; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18334; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18335; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18336; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18337; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18338; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18339; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18340; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18341; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18342; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18343; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18344; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18345; SKIP-CACHE-INV-NEXT:    s_endpgm
18346;
18347; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18348; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18349; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18350; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18351; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18352; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18353; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18354; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18355; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18356; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18357; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18358; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18359; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18360; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18361; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18362; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18363;
18364; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18365; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18366; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18367; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18368; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18369; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18370; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18371; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18372; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18373; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18374; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18375; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18376; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18377; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18378; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18379; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18380; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18381; GFX90A-TGSPLIT-NEXT:    s_endpgm
18382;
18383; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18384; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18385; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18386; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18387; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18388; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18389; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18390; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18391; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18392; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18393; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18394; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18395; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18396; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18397; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18398; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18399;
18400; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18401; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18402; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18403; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18404; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18405; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18406; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18407; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18408; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18409; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18410; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18411; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18412; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18413; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18414; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
18415; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18416; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18417; GFX940-TGSPLIT-NEXT:    s_endpgm
18418;
18419; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18420; GFX11-WGP:       ; %bb.0: ; %entry
18421; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18422; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18423; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18424; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18425; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18426; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18427; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18428; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18429; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18430; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18431; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18432; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18433; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18434; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18435; GFX11-WGP-NEXT:    buffer_gl0_inv
18436; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18437; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18438; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18439; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18440; GFX11-WGP-NEXT:    s_endpgm
18441;
18442; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18443; GFX11-CU:       ; %bb.0: ; %entry
18444; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18445; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18446; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18447; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18448; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18449; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18450; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18451; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18452; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18453; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18454; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18455; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18456; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18457; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18458; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18459; GFX11-CU-NEXT:    s_endpgm
18460;
18461; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18462; GFX12-WGP:       ; %bb.0: ; %entry
18463; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18464; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18465; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18466; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18467; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18468; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18469; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18470; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18471; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18472; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18473; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18474; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18475; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18476; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18477; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
18478; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18479; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18480; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18481; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
18482; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18483; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18484; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
18485; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18486; GFX12-WGP-NEXT:    s_endpgm
18487;
18488; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18489; GFX12-CU:       ; %bb.0: ; %entry
18490; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18491; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18492; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18493; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18494; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18495; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18496; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18497; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18498; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18499; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18500; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18501; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18502; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18503; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18504; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18505; GFX12-CU-NEXT:    s_endpgm
18506    ptr %out, i32 %in, i32 %old) {
18507entry:
18508  %gep = getelementptr i32, ptr %out, i32 4
18509  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
18510  %val0 = extractvalue { i32, i1 } %val, 0
18511  store i32 %val0, ptr %out, align 4
18512  ret void
18513}
18514
; Value-returning cmpxchg through a generic `ptr` (selected as a flat atomic in
; every configuration below) with syncscope("workgroup-one-as"), success
; ordering seq_cst and failure ordering acquire. The value loaded by the
; cmpxchg is then stored through %out.
;
; What the expected code shows, per run configuration:
;  - gfx700 (with and without -amdgcn-skip-cache-invalidations), the +cumode
;    runs of gfx1010/gfx1100/gfx1200, and gfx90a/gfx940 without +tgsplit: the
;    atomic has no wait in front of it and no cache invalidate behind it; the
;    only wait is the combined one ahead of the final store.
;  - gfx1010/gfx1100 without +cumode: `s_waitcnt vmcnt(0)` and
;    `s_waitcnt_vscnt null, 0x0` precede the atomic; `s_waitcnt vmcnt(0)` and
;    `buffer_gl0_inv` follow it.
;  - gfx1200 without +cumode: bvh/sample/load/store counter waits precede the
;    atomic, the atomic carries scope:SCOPE_SE, and bvh/sample/load waits plus
;    `global_inv scope:SCOPE_SE` follow it.
;  - gfx90a/gfx940 with +tgsplit: `s_waitcnt vmcnt(0)` on both sides of the
;    atomic, followed by `buffer_wbinvl1_vol` (gfx90a) or `buffer_inv sc0`
;    (gfx940).
;
; The assertions are autogenerated (the file header names
; utils/update_llc_test_checks.py); regenerate them instead of editing by hand.
18515define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
18516; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18517; GFX7:       ; %bb.0: ; %entry
18518; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18519; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18520; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18521; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18522; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18523; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18524; GFX7-NEXT:    s_mov_b32 s6, s4
18525; GFX7-NEXT:    s_mov_b32 s7, s5
18526; GFX7-NEXT:    s_mov_b32 s11, s12
18527; GFX7-NEXT:    s_mov_b32 s10, s13
18528; GFX7-NEXT:    s_add_u32 s6, s6, s11
18529; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18530; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18531; GFX7-NEXT:    s_mov_b32 s7, s10
18532; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18533; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18534; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18535; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18536; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18537; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18538; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18539; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18540; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18541; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18542; GFX7-NEXT:    flat_store_dword v[0:1], v2
18543; GFX7-NEXT:    s_endpgm
18544;
18545; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18546; GFX10-WGP:       ; %bb.0: ; %entry
18547; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18548; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18549; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18550; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18551; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18552; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18553; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18554; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18555; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18556; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18557; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18558; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18559; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18560; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18561; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18562; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18563; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18564; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18565; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18566; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18567; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18568; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18569; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18570; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18571; GFX10-WGP-NEXT:    buffer_gl0_inv
18572; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18573; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18574; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18575; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18576; GFX10-WGP-NEXT:    s_endpgm
18577;
18578; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18579; GFX10-CU:       ; %bb.0: ; %entry
18580; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18581; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18582; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18583; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18584; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18585; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18586; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18587; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18588; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18589; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18590; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18591; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18592; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18593; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18594; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18595; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18596; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18597; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18598; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18599; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18600; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18601; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18602; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18603; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18604; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18605; GFX10-CU-NEXT:    s_endpgm
18606;
18607; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18608; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18609; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18610; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18611; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18612; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18613; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18614; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18615; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18616; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18617; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18618; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18619; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18620; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18621; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18622; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18623; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18624; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18625; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18626; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18627; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18628; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18629; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18630; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18631; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18632; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18633; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18634; SKIP-CACHE-INV-NEXT:    s_endpgm
18635;
18636; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18637; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18638; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18639; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18640; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18641; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18642; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18643; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18644; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18645; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18646; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18647; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18648; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18649; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18650; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18651; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18652;
18653; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18654; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18655; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18656; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18657; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18658; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18659; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18660; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18661; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18662; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18663; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18664; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18665; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18666; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18667; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18668; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18669; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18670; GFX90A-TGSPLIT-NEXT:    s_endpgm
18671;
18672; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18673; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18674; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18675; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18676; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18677; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18678; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18679; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18680; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18681; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18682; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18683; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18684; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18685; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18686; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18687; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18688;
18689; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18690; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18691; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18692; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18693; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18694; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18695; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18696; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18697; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18698; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18699; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18700; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18701; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18702; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18703; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
18704; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18705; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18706; GFX940-TGSPLIT-NEXT:    s_endpgm
18707;
18708; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18709; GFX11-WGP:       ; %bb.0: ; %entry
18710; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18711; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18712; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18713; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18714; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18715; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18716; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18717; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18718; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18719; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18720; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18721; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18722; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18723; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18724; GFX11-WGP-NEXT:    buffer_gl0_inv
18725; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18726; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18727; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18728; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18729; GFX11-WGP-NEXT:    s_endpgm
18730;
18731; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18732; GFX11-CU:       ; %bb.0: ; %entry
18733; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18734; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18735; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18736; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18737; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18738; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18739; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18740; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18741; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18742; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18743; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18744; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18745; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18746; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18747; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18748; GFX11-CU-NEXT:    s_endpgm
18749;
18750; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18751; GFX12-WGP:       ; %bb.0: ; %entry
18752; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18753; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18754; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18755; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18756; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18757; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18758; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18759; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18760; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18761; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18762; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18763; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18764; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18765; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18766; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
18767; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18768; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18769; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18770; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
18771; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18772; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18773; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
18774; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18775; GFX12-WGP-NEXT:    s_endpgm
18776;
18777; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18778; GFX12-CU:       ; %bb.0: ; %entry
18779; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18780; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18781; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18782; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18783; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18784; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18785; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18786; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18787; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18788; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18789; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18790; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18791; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18792; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18793; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18794; GFX12-CU-NEXT:    s_endpgm
18795    ptr %out, i32 %in, i32 %old) {
18796entry:
  ; Element index 4 of i32 is byte offset 16: it appears as `offset:16` on the
  ; atomic where the checks fold it, and as an explicit 64-bit add of the
  ; constant 16 in the gfx700 and gfx1010 checks.
18797  %gep = getelementptr i32, ptr %out, i32 4
18798  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
  ; Field 0 of the cmpxchg result is the value read from memory; storing it to
  ; %out keeps the returning form of the atomic in the expected code (glc, sc0
  ; or TH_ATOMIC_RETURN on the cmpswap).
18799  %val0 = extractvalue { i32, i1 } %val, 0
18800  store i32 %val0, ptr %out, align 4
18801  ret void
18802}
18803
18804define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
18805; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18806; GFX7:       ; %bb.0: ; %entry
18807; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18808; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18809; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18810; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18811; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18812; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18813; GFX7-NEXT:    s_mov_b32 s6, s4
18814; GFX7-NEXT:    s_mov_b32 s7, s5
18815; GFX7-NEXT:    s_mov_b32 s11, s12
18816; GFX7-NEXT:    s_mov_b32 s10, s13
18817; GFX7-NEXT:    s_add_u32 s6, s6, s11
18818; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18819; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18820; GFX7-NEXT:    s_mov_b32 s7, s10
18821; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18822; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18823; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18824; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18825; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18826; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18827; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18828; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18829; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18830; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18831; GFX7-NEXT:    flat_store_dword v[0:1], v2
18832; GFX7-NEXT:    s_endpgm
18833;
18834; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18835; GFX10-WGP:       ; %bb.0: ; %entry
18836; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18837; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18838; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18839; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18840; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18841; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18842; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18843; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18844; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18845; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18846; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18847; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18848; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18849; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18850; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18851; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18852; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18853; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18854; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18855; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18856; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18857; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18858; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18859; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18860; GFX10-WGP-NEXT:    buffer_gl0_inv
18861; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18862; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18863; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18864; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18865; GFX10-WGP-NEXT:    s_endpgm
18866;
18867; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18868; GFX10-CU:       ; %bb.0: ; %entry
18869; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18870; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18871; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18872; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18873; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18874; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18875; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18876; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18877; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18878; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18879; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18880; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18881; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18882; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18883; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18884; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18885; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18886; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18887; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18888; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18889; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18890; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18891; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18892; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18893; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18894; GFX10-CU-NEXT:    s_endpgm
18895;
18896; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18897; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18898; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18899; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18900; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18901; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18902; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18903; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18904; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18905; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18906; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18907; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18908; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18909; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18910; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18911; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18913; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18914; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18918; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18919; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18920; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18921; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18922; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18923; SKIP-CACHE-INV-NEXT:    s_endpgm
18924;
18925; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18926; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18927; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18928; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18929; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18930; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18931; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18932; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18933; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18934; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18935; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18936; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18937; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18938; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18939; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18940; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18941;
18942; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18943; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18944; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18945; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18946; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18947; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18948; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18949; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18950; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18951; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18952; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18953; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18954; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18955; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18956; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18957; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18958; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18959; GFX90A-TGSPLIT-NEXT:    s_endpgm
18960;
18961; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18962; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18963; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18964; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18965; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18966; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18967; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18968; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18969; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18970; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18971; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18972; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18973; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18974; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18975; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18976; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18977;
18978; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18979; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18980; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18981; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18982; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18983; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18984; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18985; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18986; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18987; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18988; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18989; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18990; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18991; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18992; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
18993; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18994; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18995; GFX940-TGSPLIT-NEXT:    s_endpgm
18996;
18997; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18998; GFX11-WGP:       ; %bb.0: ; %entry
18999; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19000; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19001; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19002; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19003; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19004; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19005; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19006; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19007; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19008; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19009; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19010; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19011; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19012; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19013; GFX11-WGP-NEXT:    buffer_gl0_inv
19014; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19015; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19016; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19017; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19018; GFX11-WGP-NEXT:    s_endpgm
19019;
19020; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
19021; GFX11-CU:       ; %bb.0: ; %entry
19022; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19023; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19024; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19025; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19026; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19027; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19028; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19029; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19030; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19031; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19032; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19033; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19034; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19035; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19036; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19037; GFX11-CU-NEXT:    s_endpgm
19038;
19039; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
19040; GFX12-WGP:       ; %bb.0: ; %entry
19041; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19042; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19043; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19044; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19045; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19046; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19047; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19048; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19049; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19050; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19051; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19052; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19053; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19054; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19055; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19056; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19057; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19058; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19059; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
19060; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19061; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19062; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
19063; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19064; GFX12-WGP-NEXT:    s_endpgm
19065;
19066; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
19067; GFX12-CU:       ; %bb.0: ; %entry
19068; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19069; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19070; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19071; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19072; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19073; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19074; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19075; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19076; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19077; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19078; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19079; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19080; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19081; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19082; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19083; GFX12-CU-NEXT:    s_endpgm
19084    ptr %out, i32 %in, i32 %old) {
19085entry:
19086  %gep = getelementptr i32, ptr %out, i32 4
19087  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
19088  %val0 = extractvalue { i32, i1 } %val, 0
19089  store i32 %val0, ptr %out, align 4
19090  ret void
19091}
19092
; Returning cmpxchg through a flat pointer with syncscope("workgroup-one-as"),
; success ordering acquire and failure ordering seq_cst. The exchange targets
; %out + 16 bytes and the loaded value is then stored to %out.
;
; In the expectations below, the WGP-mode and tgsplit configurations put wait
; instructions before and after the atomic and follow it with a cache
; invalidate (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or
; global_inv). The remaining configurations emit the atomic with no
; invalidate and a single combined wait before the final store. For gfx1200
; in WGP mode the wait after the atomic is only s_wait_loadcnt.
19093define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
19094; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19095; GFX7:       ; %bb.0: ; %entry
19096; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19097; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19098; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19099; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19100; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19101; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19102; GFX7-NEXT:    s_mov_b32 s6, s4
19103; GFX7-NEXT:    s_mov_b32 s7, s5
19104; GFX7-NEXT:    s_mov_b32 s11, s12
19105; GFX7-NEXT:    s_mov_b32 s10, s13
19106; GFX7-NEXT:    s_add_u32 s6, s6, s11
19107; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19108; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19109; GFX7-NEXT:    s_mov_b32 s7, s10
19110; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19111; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19112; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19113; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19114; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19115; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19116; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19117; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19118; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19119; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19120; GFX7-NEXT:    flat_store_dword v[0:1], v2
19121; GFX7-NEXT:    s_endpgm
19122;
19123; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19124; GFX10-WGP:       ; %bb.0: ; %entry
19125; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19126; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19127; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19128; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19129; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19130; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19131; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19132; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19133; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19134; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19135; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19136; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19137; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19138; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19139; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19140; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19141; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19142; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19143; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19144; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19145; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19146; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19147; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19148; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19149; GFX10-WGP-NEXT:    buffer_gl0_inv
19150; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19151; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19152; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19153; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19154; GFX10-WGP-NEXT:    s_endpgm
19155;
19156; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19157; GFX10-CU:       ; %bb.0: ; %entry
19158; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19159; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19160; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19161; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19162; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19163; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19164; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19165; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19166; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19167; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19168; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19169; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19170; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19171; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19172; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19173; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19174; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19175; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19176; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19177; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19178; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19179; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19180; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19181; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19182; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19183; GFX10-CU-NEXT:    s_endpgm
19184;
19185; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19186; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19187; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19188; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19189; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19190; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19191; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19192; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19193; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19194; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19195; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19196; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19197; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19198; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19199; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19200; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19201; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19202; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19203; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19204; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19206; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19207; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19209; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19210; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19211; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19212; SKIP-CACHE-INV-NEXT:    s_endpgm
19213;
19214; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19215; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19216; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19217; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19218; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19219; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19220; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19221; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19222; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19223; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19224; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19225; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19226; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19227; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19228; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19229; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19230;
19231; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19232; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19233; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19234; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19235; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19236; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19237; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19238; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19239; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19240; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19241; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19242; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19243; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19244; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19245; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
19246; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19247; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19248; GFX90A-TGSPLIT-NEXT:    s_endpgm
19249;
19250; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19251; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19252; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19253; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19254; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19255; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19256; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19257; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19258; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19259; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19260; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19261; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19262; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19263; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19264; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19265; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19266;
19267; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19268; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19269; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19270; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19271; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19272; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19273; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19274; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19275; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19276; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19277; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19278; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19279; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19280; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19281; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
19282; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19283; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19284; GFX940-TGSPLIT-NEXT:    s_endpgm
19285;
19286; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19287; GFX11-WGP:       ; %bb.0: ; %entry
19288; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19289; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19290; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19291; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19292; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19293; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19294; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19295; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19296; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19297; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19298; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19299; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19300; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19301; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19302; GFX11-WGP-NEXT:    buffer_gl0_inv
19303; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19304; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19305; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19306; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19307; GFX11-WGP-NEXT:    s_endpgm
19308;
19309; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19310; GFX11-CU:       ; %bb.0: ; %entry
19311; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19312; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19313; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19314; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19315; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19316; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19317; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19318; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19319; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19320; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19321; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19322; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19323; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19324; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19325; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19326; GFX11-CU-NEXT:    s_endpgm
19327;
19328; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19329; GFX12-WGP:       ; %bb.0: ; %entry
19330; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19331; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19332; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19333; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19334; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19335; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19336; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19337; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19338; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19339; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19340; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19341; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19342; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19343; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19344; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19345; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19346; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
19347; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19348; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19349; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
19350; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19351; GFX12-WGP-NEXT:    s_endpgm
19352;
19353; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19354; GFX12-CU:       ; %bb.0: ; %entry
19355; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19356; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19357; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19358; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19359; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19360; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19361; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19362; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19363; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19364; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19365; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19366; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19367; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19368; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19369; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19370; GFX12-CU-NEXT:    s_endpgm
19371    ptr %out, i32 %in, i32 %old) {
19372entry:
  ; %gep is %out advanced by four i32 elements, i.e. the 16-byte offset that
  ; shows up in the expected assembly.
19373  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and swap in %in; the first ordering (acquire) is the
  ; success ordering and the second (seq_cst) is the failure ordering.
19374  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
  ; Element 0 of the cmpxchg result pair is the value that was loaded.
19375  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the loaded value to %out so the returning form of the atomic is used.
19376  store i32 %val0, ptr %out, align 4
19377  ret void
19378}
19379
; Returning cmpxchg through a flat pointer with syncscope("workgroup-one-as"),
; success ordering release and failure ordering seq_cst. The exchange targets
; %out + 16 bytes and the loaded value is then stored to %out.
;
; In the expectations below, the WGP-mode and tgsplit configurations put wait
; instructions before and after the atomic and follow it with a cache
; invalidate (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or
; global_inv). The remaining configurations emit the atomic with no
; invalidate and a single combined wait before the final store. For gfx1200
; in WGP mode the wait after the atomic covers bvhcnt, samplecnt and loadcnt.
19380define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
19381; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19382; GFX7:       ; %bb.0: ; %entry
19383; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19384; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19385; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19386; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19387; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19388; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19389; GFX7-NEXT:    s_mov_b32 s6, s4
19390; GFX7-NEXT:    s_mov_b32 s7, s5
19391; GFX7-NEXT:    s_mov_b32 s11, s12
19392; GFX7-NEXT:    s_mov_b32 s10, s13
19393; GFX7-NEXT:    s_add_u32 s6, s6, s11
19394; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19395; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19396; GFX7-NEXT:    s_mov_b32 s7, s10
19397; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19398; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19399; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19400; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19401; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19402; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19403; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19404; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19405; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19406; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19407; GFX7-NEXT:    flat_store_dword v[0:1], v2
19408; GFX7-NEXT:    s_endpgm
19409;
19410; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19411; GFX10-WGP:       ; %bb.0: ; %entry
19412; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19413; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19414; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19415; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19416; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19417; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19418; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19419; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19420; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19421; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19422; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19423; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19424; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19425; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19426; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19427; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19428; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19429; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19430; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19431; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19432; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19433; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19434; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19435; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19436; GFX10-WGP-NEXT:    buffer_gl0_inv
19437; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19438; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19439; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19440; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19441; GFX10-WGP-NEXT:    s_endpgm
19442;
19443; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19444; GFX10-CU:       ; %bb.0: ; %entry
19445; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19446; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19447; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19448; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19449; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19450; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19451; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19452; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19453; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19454; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19455; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19456; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19457; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19458; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19459; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19460; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19461; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19462; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19463; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19464; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19465; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19466; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19467; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19468; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19469; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19470; GFX10-CU-NEXT:    s_endpgm
19471;
19472; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19473; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19474; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19475; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19476; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19477; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19478; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19479; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19480; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19481; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19482; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19484; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19485; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19486; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19487; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19488; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19490; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19491; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19492; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19493; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19494; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19497; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19498; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19499; SKIP-CACHE-INV-NEXT:    s_endpgm
19500;
19501; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19502; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19503; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19504; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19505; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19506; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19507; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19508; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19509; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19510; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19511; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19512; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19513; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19514; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19515; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19516; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19517;
19518; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19519; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19520; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19521; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19522; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19523; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19524; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19525; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19526; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19527; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19528; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19529; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19530; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19531; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19532; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
19533; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19534; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19535; GFX90A-TGSPLIT-NEXT:    s_endpgm
19536;
19537; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19538; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19539; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19540; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19541; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19542; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19543; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19544; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19545; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19546; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19547; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19548; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19549; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19550; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19551; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19552; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19553;
19554; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19555; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19556; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19557; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19558; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19559; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19560; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19561; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19562; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19563; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19564; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19565; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19566; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19567; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19568; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
19569; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19570; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19571; GFX940-TGSPLIT-NEXT:    s_endpgm
19572;
19573; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19574; GFX11-WGP:       ; %bb.0: ; %entry
19575; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19576; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19577; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19578; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19579; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19580; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19581; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19582; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19583; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19584; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19585; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19586; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19587; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19588; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19589; GFX11-WGP-NEXT:    buffer_gl0_inv
19590; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19591; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19592; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19593; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19594; GFX11-WGP-NEXT:    s_endpgm
19595;
19596; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19597; GFX11-CU:       ; %bb.0: ; %entry
19598; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19599; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19600; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19601; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19602; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19603; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19604; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19605; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19606; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19607; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19608; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19609; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19610; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19611; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19612; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19613; GFX11-CU-NEXT:    s_endpgm
19614;
19615; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19616; GFX12-WGP:       ; %bb.0: ; %entry
19617; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19618; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19619; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19620; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19621; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19622; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19623; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19624; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19625; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19626; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19627; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19628; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19629; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19630; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19631; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19632; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19633; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19634; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19635; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
19636; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19637; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19638; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
19639; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19640; GFX12-WGP-NEXT:    s_endpgm
19641;
19642; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19643; GFX12-CU:       ; %bb.0: ; %entry
19644; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19645; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19646; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19647; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19648; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19649; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19650; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19651; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19652; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19653; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19654; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19655; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19656; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19657; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19658; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19659; GFX12-CU-NEXT:    s_endpgm
19660    ptr %out, i32 %in, i32 %old) {
19661entry:
  ; %gep is %out advanced by four i32 elements, i.e. the 16-byte offset that
  ; shows up in the expected assembly.
19662  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and swap in %in; the first ordering (release) is the
  ; success ordering and the second (seq_cst) is the failure ordering.
19663  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
  ; Element 0 of the cmpxchg result pair is the value that was loaded.
19664  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the loaded value to %out so the returning form of the atomic is used.
19665  store i32 %val0, ptr %out, align 4
19666  ret void
19667}
19668
; Flat address space, syncscope("workgroup-one-as"), cmpxchg whose result is
; used; success ordering acq_rel, failure ordering seq_cst.
; The kernel compare-and-swaps the i32 at %out + 4 elements (16 bytes, which
; is the offset:16 / constant 16 visible in the expected output) and then
; stores field 0 of the cmpxchg result back to %out.
; In the expected output below, the -WGP and -TGSPLIT check prefixes wrap the
; atomic in waits plus a cache invalidate, while the remaining prefixes only
; wait once before the final store.
; The assertion lines are autogenerated (see the first line of this file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
19669define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
19670; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19671; GFX7:       ; %bb.0: ; %entry
19672; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19673; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19674; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19675; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19676; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19677; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19678; GFX7-NEXT:    s_mov_b32 s6, s4
19679; GFX7-NEXT:    s_mov_b32 s7, s5
19680; GFX7-NEXT:    s_mov_b32 s11, s12
19681; GFX7-NEXT:    s_mov_b32 s10, s13
19682; GFX7-NEXT:    s_add_u32 s6, s6, s11
19683; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19684; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19685; GFX7-NEXT:    s_mov_b32 s7, s10
19686; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19687; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19688; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19689; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19690; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19691; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19692; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19693; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19694; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19695; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19696; GFX7-NEXT:    flat_store_dword v[0:1], v2
19697; GFX7-NEXT:    s_endpgm
19698;
19699; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19700; GFX10-WGP:       ; %bb.0: ; %entry
19701; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19702; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19703; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19704; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19705; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19706; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19707; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19708; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19709; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19710; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19711; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19712; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19713; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19714; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19715; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19716; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19717; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19718; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19719; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19720; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19721; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19722; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19723; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19724; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19725; GFX10-WGP-NEXT:    buffer_gl0_inv
19726; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19727; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19728; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19729; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19730; GFX10-WGP-NEXT:    s_endpgm
19731;
19732; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19733; GFX10-CU:       ; %bb.0: ; %entry
19734; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19735; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19736; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19737; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19738; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19739; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19740; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19741; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19742; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19743; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19744; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19745; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19746; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19747; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19748; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19749; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19750; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19751; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19752; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19753; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19754; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19755; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19756; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19757; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19758; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19759; GFX10-CU-NEXT:    s_endpgm
19760;
19761; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19762; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19763; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19764; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19765; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19766; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19767; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19768; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19769; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19770; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19771; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19772; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19773; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19774; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19775; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19776; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19777; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19778; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19779; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19780; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19783; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19784; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19786; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19787; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19788; SKIP-CACHE-INV-NEXT:    s_endpgm
19789;
19790; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19791; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19792; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19793; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19794; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19795; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19796; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19797; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19798; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19799; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19800; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19801; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19802; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19803; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19804; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19805; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19806;
19807; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19808; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19809; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19810; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19811; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19812; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19813; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19814; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19815; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19816; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19817; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19818; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19819; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19820; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19821; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
19822; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19823; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19824; GFX90A-TGSPLIT-NEXT:    s_endpgm
19825;
19826; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19827; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19828; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19829; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19830; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19831; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19832; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19833; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19834; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19835; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19836; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19837; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19838; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19839; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19840; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19841; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19842;
19843; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19844; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19845; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19846; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19847; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19848; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19849; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19850; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19851; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19852; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19853; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19854; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19855; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19856; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19857; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
19858; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19859; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19860; GFX940-TGSPLIT-NEXT:    s_endpgm
19861;
19862; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19863; GFX11-WGP:       ; %bb.0: ; %entry
19864; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19865; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19866; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19867; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19868; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19869; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19870; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19871; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19872; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19873; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19874; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19875; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19876; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19877; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19878; GFX11-WGP-NEXT:    buffer_gl0_inv
19879; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19880; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19881; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19882; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19883; GFX11-WGP-NEXT:    s_endpgm
19884;
19885; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19886; GFX11-CU:       ; %bb.0: ; %entry
19887; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19888; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19889; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19890; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19891; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19892; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19893; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19894; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19895; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19896; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19897; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19898; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19899; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19900; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19901; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19902; GFX11-CU-NEXT:    s_endpgm
19903;
19904; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19905; GFX12-WGP:       ; %bb.0: ; %entry
19906; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19907; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19908; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19909; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19910; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19911; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19912; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19913; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19914; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19915; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19916; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19917; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19918; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19919; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19920; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19921; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19922; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19923; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19924; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
19925; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19926; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19927; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
19928; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19929; GFX12-WGP-NEXT:    s_endpgm
19930;
19931; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19932; GFX12-CU:       ; %bb.0: ; %entry
19933; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19934; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19935; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19936; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19937; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19938; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19939; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19940; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19941; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19942; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19943; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19944; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19945; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19946; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19947; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19948; GFX12-CU-NEXT:    s_endpgm
19949    ptr %out, i32 %in, i32 %old) {
19950entry:
  ; Address of i32 element 4 past %out, i.e. %out + 16 bytes.
19951  %gep = getelementptr i32, ptr %out, i32 4
  ; Operation under test, a volatile cmpxchg with acq_rel on success and
  ; seq_cst on failure at workgroup-one-as scope.
19952  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
  ; Use field 0 of the { i32, i1 } result so the returning form of the atomic
  ; is exercised, then write it to %out with a plain store.
19953  %val0 = extractvalue { i32, i1 } %val, 0
19954  store i32 %val0, ptr %out, align 4
19955  ret void
19956}
19957
; Flat address space, syncscope("workgroup-one-as"), cmpxchg whose result is
; used; success ordering seq_cst, failure ordering seq_cst.
; The kernel compare-and-swaps the i32 at %out + 4 elements (16 bytes, which
; is the offset:16 / constant 16 visible in the expected output) and then
; stores field 0 of the cmpxchg result back to %out.
; In the expected output below, the -WGP and -TGSPLIT check prefixes wrap the
; atomic in waits plus a cache invalidate, while the remaining prefixes only
; wait once before the final store.
; The assertion lines are autogenerated (see the first line of this file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
19958define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
19959; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
19960; GFX7:       ; %bb.0: ; %entry
19961; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19962; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19963; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19964; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19965; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19966; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19967; GFX7-NEXT:    s_mov_b32 s6, s4
19968; GFX7-NEXT:    s_mov_b32 s7, s5
19969; GFX7-NEXT:    s_mov_b32 s11, s12
19970; GFX7-NEXT:    s_mov_b32 s10, s13
19971; GFX7-NEXT:    s_add_u32 s6, s6, s11
19972; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19973; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19974; GFX7-NEXT:    s_mov_b32 s7, s10
19975; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19976; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19977; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19978; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19979; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19980; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19981; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19982; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19983; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19984; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19985; GFX7-NEXT:    flat_store_dword v[0:1], v2
19986; GFX7-NEXT:    s_endpgm
19987;
19988; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
19989; GFX10-WGP:       ; %bb.0: ; %entry
19990; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19991; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19992; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19993; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19994; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19995; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19996; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19997; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19998; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19999; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
20000; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
20001; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
20002; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20003; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
20004; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
20005; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
20006; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20007; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
20008; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
20009; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
20010; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20011; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
20012; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20013; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20014; GFX10-WGP-NEXT:    buffer_gl0_inv
20015; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
20016; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
20017; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20018; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
20019; GFX10-WGP-NEXT:    s_endpgm
20020;
20021; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20022; GFX10-CU:       ; %bb.0: ; %entry
20023; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
20024; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20025; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
20026; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
20027; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
20028; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20029; GFX10-CU-NEXT:    s_mov_b32 s6, s4
20030; GFX10-CU-NEXT:    s_mov_b32 s7, s5
20031; GFX10-CU-NEXT:    s_mov_b32 s11, s12
20032; GFX10-CU-NEXT:    s_mov_b32 s10, s13
20033; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
20034; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
20035; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20036; GFX10-CU-NEXT:    s_mov_b32 s7, s10
20037; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
20038; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
20039; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20040; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
20041; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
20042; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
20043; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20044; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
20045; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
20046; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20047; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
20048; GFX10-CU-NEXT:    s_endpgm
20049;
20050; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20051; SKIP-CACHE-INV:       ; %bb.0: ; %entry
20052; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
20053; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
20054; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
20055; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
20056; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
20057; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20058; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
20059; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
20060; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
20061; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
20062; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
20063; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
20064; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
20065; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
20066; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
20067; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
20068; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20069; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
20070; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
20071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
20072; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20073; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
20074; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
20075; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20076; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
20077; SKIP-CACHE-INV-NEXT:    s_endpgm
20078;
20079; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20080; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
20081; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20082; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20083; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20084; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20085; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20086; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20087; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20088; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20089; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20090; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20091; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20092; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20093; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20094; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
20095;
20096; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20097; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
20098; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20099; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20100; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20101; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20102; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20103; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20104; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20105; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20106; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20107; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20108; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20109; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20110; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
20111; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20112; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20113; GFX90A-TGSPLIT-NEXT:    s_endpgm
20114;
20115; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20116; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
20117; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20118; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20119; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20120; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20121; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20122; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20123; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20124; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20125; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20126; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
20127; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20128; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20129; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20130; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
20131;
20132; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20133; GFX940-TGSPLIT:       ; %bb.0: ; %entry
20134; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20135; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20136; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20137; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20138; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20139; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20140; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20141; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20142; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20143; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20144; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
20145; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20146; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
20147; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20148; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20149; GFX940-TGSPLIT-NEXT:    s_endpgm
20150;
20151; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20152; GFX11-WGP:       ; %bb.0: ; %entry
20153; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20154; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20155; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20156; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20157; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
20158; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
20159; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20160; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
20161; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20162; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20163; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
20164; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
20165; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20166; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
20167; GFX11-WGP-NEXT:    buffer_gl0_inv
20168; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20169; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20170; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20171; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
20172; GFX11-WGP-NEXT:    s_endpgm
20173;
20174; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20175; GFX11-CU:       ; %bb.0: ; %entry
20176; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20177; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20178; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20179; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20180; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
20181; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
20182; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20183; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
20184; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20185; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20186; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20187; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20188; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20189; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
20190; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
20191; GFX11-CU-NEXT:    s_endpgm
20192;
20193; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20194; GFX12-WGP:       ; %bb.0: ; %entry
20195; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20196; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20197; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20198; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
20199; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
20200; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
20201; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20202; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
20203; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20204; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20205; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
20206; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
20207; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
20208; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
20209; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
20210; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
20211; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
20212; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
20213; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
20214; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20215; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20216; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
20217; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
20218; GFX12-WGP-NEXT:    s_endpgm
20219;
20220; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20221; GFX12-CU:       ; %bb.0: ; %entry
20222; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20223; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20224; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20225; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
20226; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
20227; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
20228; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20229; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
20230; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20231; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20232; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
20233; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20234; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20235; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
20236; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
20237; GFX12-CU-NEXT:    s_endpgm
20238    ptr %out, i32 %in, i32 %old) {
20239entry:
  ; Address of i32 element 4 past %out, i.e. %out + 16 bytes.
20240  %gep = getelementptr i32, ptr %out, i32 4
  ; Operation under test, a volatile cmpxchg with seq_cst on both success and
  ; failure at workgroup-one-as scope.
20241  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
  ; Use field 0 of the { i32, i1 } result so the returning form of the atomic
  ; is exercised, then write it to %out with a plain store.
20242  %val0 = extractvalue { i32, i1 } %val, 0
20243  store i32 %val0, ptr %out, align 4
20244  ret void
20245}
20246