xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
14
; flat_system_unordered_load - atomically loads an i32 from %in with
; `unordered` ordering (no explicit syncscope is given on the load) and then
; writes the loaded value to %out with an ordinary, non-atomic store.
;
; The autogenerated checks below pin, for every run line, a flat load that
; carries no cache-policy modifier, a wait for that load placed just before
; the store, and the flat store itself. The two gfx940 runs additionally
; expect `sc0 sc1` on the store, and the two tgsplit runs expect a vmcnt-only
; wait before the store.
15define amdgpu_kernel void @flat_system_unordered_load(
16; GFX7-LABEL: flat_system_unordered_load:
17; GFX7:       ; %bb.0: ; %entry
18; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
19; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    v_mov_b32_e32 v0, s6
22; GFX7-NEXT:    v_mov_b32_e32 v1, s7
23; GFX7-NEXT:    flat_load_dword v2, v[0:1]
24; GFX7-NEXT:    v_mov_b32_e32 v0, s4
25; GFX7-NEXT:    v_mov_b32_e32 v1, s5
26; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
27; GFX7-NEXT:    flat_store_dword v[0:1], v2
28; GFX7-NEXT:    s_endpgm
29;
30; GFX10-WGP-LABEL: flat_system_unordered_load:
31; GFX10-WGP:       ; %bb.0: ; %entry
32; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
33; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
34; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
36; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
37; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
38; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
39; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
40; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
41; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
42; GFX10-WGP-NEXT:    s_endpgm
43;
44; GFX10-CU-LABEL: flat_system_unordered_load:
45; GFX10-CU:       ; %bb.0: ; %entry
46; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
47; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
48; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
50; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
51; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
52; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
53; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
54; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
55; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
56; GFX10-CU-NEXT:    s_endpgm
57;
58; SKIP-CACHE-INV-LABEL: flat_system_unordered_load:
59; SKIP-CACHE-INV:       ; %bb.0: ; %entry
60; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
61; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
62; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
63; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
64; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
65; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
66; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
67; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
68; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
69; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
70; SKIP-CACHE-INV-NEXT:    s_endpgm
71;
72; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
73; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
74; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
75; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
76; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
78; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
79; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
80; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
81; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
82; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
83;
84; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
85; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
86; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
87; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
88; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
90; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
91; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
92; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
93; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
94; GFX90A-TGSPLIT-NEXT:    s_endpgm
95;
96; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_load:
97; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
98; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
99; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
100; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
102; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
103; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
104; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
105; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
106; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
107;
108; GFX940-TGSPLIT-LABEL: flat_system_unordered_load:
109; GFX940-TGSPLIT:       ; %bb.0: ; %entry
110; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
111; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
112; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
114; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
115; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
116; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
117; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
118; GFX940-TGSPLIT-NEXT:    s_endpgm
119;
120; GFX11-WGP-LABEL: flat_system_unordered_load:
121; GFX11-WGP:       ; %bb.0: ; %entry
122; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
123; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
124; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
126; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
127; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
128; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
129; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
130; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
131; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
132; GFX11-WGP-NEXT:    s_endpgm
133;
134; GFX11-CU-LABEL: flat_system_unordered_load:
135; GFX11-CU:       ; %bb.0: ; %entry
136; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
137; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
138; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
140; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
141; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
142; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
143; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
144; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
145; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
146; GFX11-CU-NEXT:    s_endpgm
147;
148; GFX12-WGP-LABEL: flat_system_unordered_load:
149; GFX12-WGP:       ; %bb.0: ; %entry
150; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
151; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
152; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
153; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
154; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
155; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
156; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
157; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
158; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
159; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
160; GFX12-WGP-NEXT:    s_endpgm
161;
162; GFX12-CU-LABEL: flat_system_unordered_load:
163; GFX12-CU:       ; %bb.0: ; %entry
164; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
165; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
166; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
167; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
168; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
169; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
170; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
171; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
172; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
173; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
174; GFX12-CU-NEXT:    s_endpgm
175    ptr %in, ptr %out) {
176entry:
177  %val = load atomic i32, ptr %in unordered, align 4
178  store i32 %val, ptr %out
179  ret void
180}
181
; flat_system_monotonic_load - atomically loads an i32 from %in with
; `monotonic` ordering (no explicit syncscope is given on the load) and then
; writes the loaded value to %out with an ordinary, non-atomic store.
;
; The autogenerated checks below expect the flat load to carry a cache-policy
; modifier on every run line: `glc` for gfx700 (both triples), gfx90a and
; gfx1100; `glc dlc` for gfx1010; `sc0 sc1` for gfx940; and
; `scope:SCOPE_SYS` for gfx1200. No cache-invalidate instruction is expected
; between the load and the store; only a wait for the load appears just
; before the store.
182define amdgpu_kernel void @flat_system_monotonic_load(
183; GFX7-LABEL: flat_system_monotonic_load:
184; GFX7:       ; %bb.0: ; %entry
185; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
186; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
187; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX7-NEXT:    v_mov_b32_e32 v0, s6
189; GFX7-NEXT:    v_mov_b32_e32 v1, s7
190; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
191; GFX7-NEXT:    v_mov_b32_e32 v0, s4
192; GFX7-NEXT:    v_mov_b32_e32 v1, s5
193; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
194; GFX7-NEXT:    flat_store_dword v[0:1], v2
195; GFX7-NEXT:    s_endpgm
196;
197; GFX10-WGP-LABEL: flat_system_monotonic_load:
198; GFX10-WGP:       ; %bb.0: ; %entry
199; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
200; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
201; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
202; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
203; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
204; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
205; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
206; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
207; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
208; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
209; GFX10-WGP-NEXT:    s_endpgm
210;
211; GFX10-CU-LABEL: flat_system_monotonic_load:
212; GFX10-CU:       ; %bb.0: ; %entry
213; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
214; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
215; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
217; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
218; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
219; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
220; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
221; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
222; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
223; GFX10-CU-NEXT:    s_endpgm
224;
225; SKIP-CACHE-INV-LABEL: flat_system_monotonic_load:
226; SKIP-CACHE-INV:       ; %bb.0: ; %entry
227; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
228; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
229; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
231; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
232; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
235; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
236; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
237; SKIP-CACHE-INV-NEXT:    s_endpgm
238;
239; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
240; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
241; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
242; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
243; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
245; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
246; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
247; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
248; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
249; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
250;
251; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
252; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
253; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
254; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
255; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
257; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
259; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
260; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
261; GFX90A-TGSPLIT-NEXT:    s_endpgm
262;
263; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
264; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
265; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
266; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
267; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
269; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
270; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
271; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
272; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
273; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
274;
275; GFX940-TGSPLIT-LABEL: flat_system_monotonic_load:
276; GFX940-TGSPLIT:       ; %bb.0: ; %entry
277; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
278; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
279; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
280; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
281; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
282; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
283; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
284; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
285; GFX940-TGSPLIT-NEXT:    s_endpgm
286;
287; GFX11-WGP-LABEL: flat_system_monotonic_load:
288; GFX11-WGP:       ; %bb.0: ; %entry
289; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
290; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
291; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
293; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
294; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
295; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
296; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
297; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
299; GFX11-WGP-NEXT:    s_endpgm
300;
301; GFX11-CU-LABEL: flat_system_monotonic_load:
302; GFX11-CU:       ; %bb.0: ; %entry
303; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
304; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
305; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
307; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
308; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
309; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
310; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
311; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
312; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
313; GFX11-CU-NEXT:    s_endpgm
314;
315; GFX12-WGP-LABEL: flat_system_monotonic_load:
316; GFX12-WGP:       ; %bb.0: ; %entry
317; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
318; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
319; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
320; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
321; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
322; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
323; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
324; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
325; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
326; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
327; GFX12-WGP-NEXT:    s_endpgm
328;
329; GFX12-CU-LABEL: flat_system_monotonic_load:
330; GFX12-CU:       ; %bb.0: ; %entry
331; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
332; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
333; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
334; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
335; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
336; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
337; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
338; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
339; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
340; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
341; GFX12-CU-NEXT:    s_endpgm
342    ptr %in, ptr %out) {
343entry:
344  %val = load atomic i32, ptr %in monotonic, align 4
345  store i32 %val, ptr %out
346  ret void
347}
348
; flat_system_acquire_load - atomically loads an i32 from %in with `acquire`
; ordering (no explicit syncscope is given on the load) and then writes the
; loaded value to %out with an ordinary, non-atomic store.
;
; The autogenerated checks below expect, on every run line, a flat load with
; a cache-policy modifier (`glc`, `glc dlc`, `sc0 sc1` or `scope:SCOPE_SYS`
; depending on the target) followed immediately by a wait for that load.
; After the wait, each target except the -amdgcn-skip-cache-invalidations run
; expects a cache invalidate before the store address is set up:
;   gfx700            buffer_wbinvl1_vol
;   gfx1010, gfx1100  buffer_gl1_inv then buffer_gl0_inv
;   gfx90a            buffer_invl2 then buffer_wbinvl1_vol
;   gfx940            buffer_inv sc0 sc1
;   gfx1200           global_inv scope:SCOPE_SYS
; The skip-cache-invalidations run keeps the wait but has no invalidate.
349define amdgpu_kernel void @flat_system_acquire_load(
350; GFX7-LABEL: flat_system_acquire_load:
351; GFX7:       ; %bb.0: ; %entry
352; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
353; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7-NEXT:    v_mov_b32_e32 v0, s6
356; GFX7-NEXT:    v_mov_b32_e32 v1, s7
357; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
358; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
359; GFX7-NEXT:    buffer_wbinvl1_vol
360; GFX7-NEXT:    v_mov_b32_e32 v0, s4
361; GFX7-NEXT:    v_mov_b32_e32 v1, s5
362; GFX7-NEXT:    flat_store_dword v[0:1], v2
363; GFX7-NEXT:    s_endpgm
364;
365; GFX10-WGP-LABEL: flat_system_acquire_load:
366; GFX10-WGP:       ; %bb.0: ; %entry
367; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
368; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
369; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
371; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
372; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
373; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
374; GFX10-WGP-NEXT:    buffer_gl1_inv
375; GFX10-WGP-NEXT:    buffer_gl0_inv
376; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
377; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
378; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
379; GFX10-WGP-NEXT:    s_endpgm
380;
381; GFX10-CU-LABEL: flat_system_acquire_load:
382; GFX10-CU:       ; %bb.0: ; %entry
383; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
384; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
385; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
387; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
388; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
389; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
390; GFX10-CU-NEXT:    buffer_gl1_inv
391; GFX10-CU-NEXT:    buffer_gl0_inv
392; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
393; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
394; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
395; GFX10-CU-NEXT:    s_endpgm
396;
397; SKIP-CACHE-INV-LABEL: flat_system_acquire_load:
398; SKIP-CACHE-INV:       ; %bb.0: ; %entry
399; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
400; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
401; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
402; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
403; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
404; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
405; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
406; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
408; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
409; SKIP-CACHE-INV-NEXT:    s_endpgm
410;
411; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
412; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
413; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
414; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
415; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
417; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
418; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
419; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
420; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
421; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
422; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
423; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
424;
425; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
426; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
427; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
428; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
429; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
431; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
432; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
433; GFX90A-TGSPLIT-NEXT:    buffer_invl2
434; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
435; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
436; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
437; GFX90A-TGSPLIT-NEXT:    s_endpgm
438;
439; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_load:
440; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
441; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
442; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
443; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
445; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
446; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
447; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
448; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
449; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
450; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
451;
452; GFX940-TGSPLIT-LABEL: flat_system_acquire_load:
453; GFX940-TGSPLIT:       ; %bb.0: ; %entry
454; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
455; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
456; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
458; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
459; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
460; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
461; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
462; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
463; GFX940-TGSPLIT-NEXT:    s_endpgm
464;
465; GFX11-WGP-LABEL: flat_system_acquire_load:
466; GFX11-WGP:       ; %bb.0: ; %entry
467; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
468; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
469; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
471; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
472; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
473; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
474; GFX11-WGP-NEXT:    buffer_gl1_inv
475; GFX11-WGP-NEXT:    buffer_gl0_inv
476; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
477; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
478; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
479; GFX11-WGP-NEXT:    s_endpgm
480;
481; GFX11-CU-LABEL: flat_system_acquire_load:
482; GFX11-CU:       ; %bb.0: ; %entry
483; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
484; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
485; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
487; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
488; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
489; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
490; GFX11-CU-NEXT:    buffer_gl1_inv
491; GFX11-CU-NEXT:    buffer_gl0_inv
492; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
493; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
494; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
495; GFX11-CU-NEXT:    s_endpgm
496;
497; GFX12-WGP-LABEL: flat_system_acquire_load:
498; GFX12-WGP:       ; %bb.0: ; %entry
499; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
500; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
501; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
502; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
503; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
504; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
505; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
506; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
507; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
508; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
509; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
510; GFX12-WGP-NEXT:    s_endpgm
511;
512; GFX12-CU-LABEL: flat_system_acquire_load:
513; GFX12-CU:       ; %bb.0: ; %entry
514; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
515; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
516; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
517; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
518; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
519; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
520; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
521; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
522; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
523; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
524; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
525; GFX12-CU-NEXT:    s_endpgm
526    ptr %in, ptr %out) {
527entry:
528  %val = load atomic i32, ptr %in acquire, align 4
529  store i32 %val, ptr %out
530  ret void
531}
532
533define amdgpu_kernel void @flat_system_seq_cst_load(
534; GFX7-LABEL: flat_system_seq_cst_load:
535; GFX7:       ; %bb.0: ; %entry
536; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
537; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
538; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX7-NEXT:    v_mov_b32_e32 v0, s6
540; GFX7-NEXT:    v_mov_b32_e32 v1, s7
541; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
542; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
543; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
544; GFX7-NEXT:    buffer_wbinvl1_vol
545; GFX7-NEXT:    v_mov_b32_e32 v0, s4
546; GFX7-NEXT:    v_mov_b32_e32 v1, s5
547; GFX7-NEXT:    flat_store_dword v[0:1], v2
548; GFX7-NEXT:    s_endpgm
549;
550; GFX10-WGP-LABEL: flat_system_seq_cst_load:
551; GFX10-WGP:       ; %bb.0: ; %entry
552; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
553; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
554; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
556; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
557; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
558; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
559; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
560; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
561; GFX10-WGP-NEXT:    buffer_gl1_inv
562; GFX10-WGP-NEXT:    buffer_gl0_inv
563; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
564; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
565; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
566; GFX10-WGP-NEXT:    s_endpgm
567;
568; GFX10-CU-LABEL: flat_system_seq_cst_load:
569; GFX10-CU:       ; %bb.0: ; %entry
570; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
571; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
572; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
574; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
575; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
576; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
577; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
578; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
579; GFX10-CU-NEXT:    buffer_gl1_inv
580; GFX10-CU-NEXT:    buffer_gl0_inv
581; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
582; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
583; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
584; GFX10-CU-NEXT:    s_endpgm
585;
586; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_load:
587; SKIP-CACHE-INV:       ; %bb.0: ; %entry
588; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
589; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
590; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
592; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
593; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
594; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
595; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
596; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
598; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
599; SKIP-CACHE-INV-NEXT:    s_endpgm
600;
601; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
602; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
603; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
604; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
605; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
606; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
607; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
608; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
609; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
610; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
611; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
612; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
613; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
614; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
615;
616; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
617; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
618; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
619; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
620; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
622; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
623; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
624; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
625; GFX90A-TGSPLIT-NEXT:    buffer_invl2
626; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
627; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
628; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
629; GFX90A-TGSPLIT-NEXT:    s_endpgm
630;
631; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
632; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
633; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
634; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
635; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
637; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
638; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
639; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
640; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
641; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
642; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
643; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
644;
645; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_load:
646; GFX940-TGSPLIT:       ; %bb.0: ; %entry
647; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
648; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
649; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
651; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
652; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
653; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
654; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
655; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
656; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
657; GFX940-TGSPLIT-NEXT:    s_endpgm
658;
659; GFX11-WGP-LABEL: flat_system_seq_cst_load:
660; GFX11-WGP:       ; %bb.0: ; %entry
661; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
662; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
663; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
665; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
666; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
667; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
668; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
669; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
670; GFX11-WGP-NEXT:    buffer_gl1_inv
671; GFX11-WGP-NEXT:    buffer_gl0_inv
672; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
673; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
674; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
675; GFX11-WGP-NEXT:    s_endpgm
676;
677; GFX11-CU-LABEL: flat_system_seq_cst_load:
678; GFX11-CU:       ; %bb.0: ; %entry
679; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
680; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
681; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
682; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
683; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
684; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
685; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
686; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
687; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
688; GFX11-CU-NEXT:    buffer_gl1_inv
689; GFX11-CU-NEXT:    buffer_gl0_inv
690; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
691; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
692; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
693; GFX11-CU-NEXT:    s_endpgm
694;
695; GFX12-WGP-LABEL: flat_system_seq_cst_load:
696; GFX12-WGP:       ; %bb.0: ; %entry
697; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
698; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
699; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
700; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
701; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
702; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
703; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
704; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
705; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
706; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
707; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
708; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
709; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
710; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
711; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
712; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
713; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
714; GFX12-WGP-NEXT:    s_endpgm
715;
716; GFX12-CU-LABEL: flat_system_seq_cst_load:
717; GFX12-CU:       ; %bb.0: ; %entry
718; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
719; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
720; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
721; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
722; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
723; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
724; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
725; GFX12-CU-NEXT:    s_wait_storecnt 0x0
726; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
727; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
728; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
729; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
730; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
731; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
732; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
733; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
734; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
735; GFX12-CU-NEXT:    s_endpgm
736    ptr %in, ptr %out) {
737entry:
738  %val = load atomic i32, ptr %in seq_cst, align 4
739  store i32 %val, ptr %out
740  ret void
741}
742
; Kernel under test: an `unordered` atomic i32 store through a flat pointer,
; with no syncscope specified on the instruction.
; Expected code for every llc configuration is two scalar loads, a wait on
; them, register moves, and then a single flat store with nothing inserted
; in front of it. Only the two gfx940 configurations expect `sc0 sc1` on the
; store; the gfx1200 configurations expect no scope modifier here.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with the update script instead of editing by hand.
743define amdgpu_kernel void @flat_system_unordered_store(
744; GFX7-LABEL: flat_system_unordered_store:
745; GFX7:       ; %bb.0: ; %entry
746; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
747; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
748; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX7-NEXT:    v_mov_b32_e32 v0, s6
750; GFX7-NEXT:    v_mov_b32_e32 v1, s7
751; GFX7-NEXT:    v_mov_b32_e32 v2, s4
752; GFX7-NEXT:    flat_store_dword v[0:1], v2
753; GFX7-NEXT:    s_endpgm
754;
755; GFX10-WGP-LABEL: flat_system_unordered_store:
756; GFX10-WGP:       ; %bb.0: ; %entry
757; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
758; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
759; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
761; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
762; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
763; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
764; GFX10-WGP-NEXT:    s_endpgm
765;
766; GFX10-CU-LABEL: flat_system_unordered_store:
767; GFX10-CU:       ; %bb.0: ; %entry
768; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
769; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
770; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
771; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
772; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
773; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
774; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
775; GFX10-CU-NEXT:    s_endpgm
776;
777; SKIP-CACHE-INV-LABEL: flat_system_unordered_store:
778; SKIP-CACHE-INV:       ; %bb.0: ; %entry
779; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
780; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
781; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
783; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
784; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
785; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
786; SKIP-CACHE-INV-NEXT:    s_endpgm
787;
788; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
789; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
790; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
791; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
792; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
794; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
795; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
796; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
797;
798; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
799; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
800; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
801; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
802; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
804; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
805; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
806; GFX90A-TGSPLIT-NEXT:    s_endpgm
807;
808; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_store:
809; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
810; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
811; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
812; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
813; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
814; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
815; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
816; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
817;
818; GFX940-TGSPLIT-LABEL: flat_system_unordered_store:
819; GFX940-TGSPLIT:       ; %bb.0: ; %entry
820; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
821; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
822; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
824; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
825; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
826; GFX940-TGSPLIT-NEXT:    s_endpgm
827;
828; GFX11-WGP-LABEL: flat_system_unordered_store:
829; GFX11-WGP:       ; %bb.0: ; %entry
830; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
831; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
832; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
834; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
835; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
836; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
837; GFX11-WGP-NEXT:    s_endpgm
838;
839; GFX11-CU-LABEL: flat_system_unordered_store:
840; GFX11-CU:       ; %bb.0: ; %entry
841; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
842; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
843; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
845; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
846; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
847; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
848; GFX11-CU-NEXT:    s_endpgm
849;
850; GFX12-WGP-LABEL: flat_system_unordered_store:
851; GFX12-WGP:       ; %bb.0: ; %entry
852; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
853; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
854; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
855; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
856; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
857; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
858; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
859; GFX12-WGP-NEXT:    s_endpgm
860;
861; GFX12-CU-LABEL: flat_system_unordered_store:
862; GFX12-CU:       ; %bb.0: ; %entry
863; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
864; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
865; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
866; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
867; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
868; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
869; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
870; GFX12-CU-NEXT:    s_endpgm
871    i32 %in, ptr %out) {
872entry:
  ; The only memory operation in the kernel: store %in to %out with
  ; `unordered` atomic ordering.
873  store atomic i32 %in, ptr %out unordered, align 4
874  ret void
875}
876
; Kernel under test: a `monotonic` atomic i32 store through a flat pointer,
; with no syncscope specified on the instruction.
; Expected code for every llc configuration is two scalar loads, a wait on
; them, register moves, and then a single flat store with nothing inserted
; in front of it. The gfx940 configurations expect `sc0 sc1` on the store
; and the gfx1200 configurations expect `scope:SCOPE_SYS` on it; all other
; configurations expect the store without modifiers.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with the update script instead of editing by hand.
877define amdgpu_kernel void @flat_system_monotonic_store(
878; GFX7-LABEL: flat_system_monotonic_store:
879; GFX7:       ; %bb.0: ; %entry
880; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
881; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
882; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX7-NEXT:    v_mov_b32_e32 v0, s6
884; GFX7-NEXT:    v_mov_b32_e32 v1, s7
885; GFX7-NEXT:    v_mov_b32_e32 v2, s4
886; GFX7-NEXT:    flat_store_dword v[0:1], v2
887; GFX7-NEXT:    s_endpgm
888;
889; GFX10-WGP-LABEL: flat_system_monotonic_store:
890; GFX10-WGP:       ; %bb.0: ; %entry
891; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
892; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
893; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
894; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
895; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
896; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
897; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
898; GFX10-WGP-NEXT:    s_endpgm
899;
900; GFX10-CU-LABEL: flat_system_monotonic_store:
901; GFX10-CU:       ; %bb.0: ; %entry
902; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
903; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
904; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
906; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
907; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
908; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
909; GFX10-CU-NEXT:    s_endpgm
910;
911; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store:
912; SKIP-CACHE-INV:       ; %bb.0: ; %entry
913; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
914; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
915; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
919; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
920; SKIP-CACHE-INV-NEXT:    s_endpgm
921;
922; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
923; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
924; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
925; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
926; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
928; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
929; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
930; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
931;
932; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
933; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
934; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
935; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
936; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
938; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
939; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
940; GFX90A-TGSPLIT-NEXT:    s_endpgm
941;
942; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
943; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
944; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
945; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
946; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
947; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
948; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
949; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
950; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
951;
952; GFX940-TGSPLIT-LABEL: flat_system_monotonic_store:
953; GFX940-TGSPLIT:       ; %bb.0: ; %entry
954; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
955; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
956; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
957; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
958; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
959; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
960; GFX940-TGSPLIT-NEXT:    s_endpgm
961;
962; GFX11-WGP-LABEL: flat_system_monotonic_store:
963; GFX11-WGP:       ; %bb.0: ; %entry
964; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
965; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
966; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
968; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
969; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
970; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
971; GFX11-WGP-NEXT:    s_endpgm
972;
973; GFX11-CU-LABEL: flat_system_monotonic_store:
974; GFX11-CU:       ; %bb.0: ; %entry
975; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
976; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
977; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
979; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
980; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
981; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
982; GFX11-CU-NEXT:    s_endpgm
983;
984; GFX12-WGP-LABEL: flat_system_monotonic_store:
985; GFX12-WGP:       ; %bb.0: ; %entry
986; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
987; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
988; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
989; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
990; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
991; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
992; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
993; GFX12-WGP-NEXT:    s_endpgm
994;
995; GFX12-CU-LABEL: flat_system_monotonic_store:
996; GFX12-CU:       ; %bb.0: ; %entry
997; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
998; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
999; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1000; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1001; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1002; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1003; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
1004; GFX12-CU-NEXT:    s_endpgm
1005    i32 %in, ptr %out) {
1006entry:
  ; The only memory operation in the kernel: store %in to %out with
  ; `monotonic` atomic ordering.
1007  store atomic i32 %in, ptr %out monotonic, align 4
1008  ret void
1009}
1010
; Kernel under test: a `release` atomic i32 store through a flat pointer,
; with no syncscope specified on the instruction.
; Every configuration expects extra instructions immediately before the
; flat store:
;   - both gfx700 runs expect `s_waitcnt vmcnt(0) lgkmcnt(0)`;
;   - gfx1010 and gfx1100 expect that wait plus `s_waitcnt_vscnt null, 0x0`;
;   - gfx90a expects `buffer_wbl2` followed by the wait;
;   - gfx940 expects `buffer_wbl2 sc0 sc1` followed by the wait, and
;     `sc0 sc1` on the store itself;
;   - gfx1200 expects `global_wb scope:SCOPE_SYS`, then waits on the bvh,
;     sample, store and load/ds counters, and `scope:SCOPE_SYS` on the store.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with the update script instead of editing by hand.
1011define amdgpu_kernel void @flat_system_release_store(
1012; GFX7-LABEL: flat_system_release_store:
1013; GFX7:       ; %bb.0: ; %entry
1014; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
1015; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
1016; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1018; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1019; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1020; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1021; GFX7-NEXT:    flat_store_dword v[0:1], v2
1022; GFX7-NEXT:    s_endpgm
1023;
1024; GFX10-WGP-LABEL: flat_system_release_store:
1025; GFX10-WGP:       ; %bb.0: ; %entry
1026; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
1027; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1028; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1030; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1031; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1032; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1033; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1034; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1035; GFX10-WGP-NEXT:    s_endpgm
1036;
1037; GFX10-CU-LABEL: flat_system_release_store:
1038; GFX10-CU:       ; %bb.0: ; %entry
1039; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
1040; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1041; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1042; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1043; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1044; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1045; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1046; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1047; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1048; GFX10-CU-NEXT:    s_endpgm
1049;
1050; SKIP-CACHE-INV-LABEL: flat_system_release_store:
1051; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1052; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
1053; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1054; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1055; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1056; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1057; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1058; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1059; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1060; SKIP-CACHE-INV-NEXT:    s_endpgm
1061;
1062; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
1063; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1064; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1065; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1066; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1067; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1068; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1069; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1070; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1071; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1072; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1073;
1074; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
1075; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1076; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1077; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1078; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1079; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1080; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1081; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1082; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1083; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1084; GFX90A-TGSPLIT-NEXT:    s_endpgm
1085;
1086; GFX940-NOTTGSPLIT-LABEL: flat_system_release_store:
1087; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1088; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1089; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1090; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1091; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1092; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1093; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1094; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1095; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1096; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1097;
1098; GFX940-TGSPLIT-LABEL: flat_system_release_store:
1099; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1100; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1101; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1102; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1103; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1104; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1105; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1106; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1107; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1108; GFX940-TGSPLIT-NEXT:    s_endpgm
1109;
1110; GFX11-WGP-LABEL: flat_system_release_store:
1111; GFX11-WGP:       ; %bb.0: ; %entry
1112; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1113; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1114; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1116; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1117; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1118; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1119; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1120; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1121; GFX11-WGP-NEXT:    s_endpgm
1122;
1123; GFX11-CU-LABEL: flat_system_release_store:
1124; GFX11-CU:       ; %bb.0: ; %entry
1125; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1126; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1127; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1129; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1130; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1131; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1132; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1133; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1134; GFX11-CU-NEXT:    s_endpgm
1135;
1136; GFX12-WGP-LABEL: flat_system_release_store:
1137; GFX12-WGP:       ; %bb.0: ; %entry
1138; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1139; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1140; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1141; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1142; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1143; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1144; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
1145; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1146; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1147; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1148; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1149; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
1150; GFX12-WGP-NEXT:    s_endpgm
1151;
1152; GFX12-CU-LABEL: flat_system_release_store:
1153; GFX12-CU:       ; %bb.0: ; %entry
1154; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1155; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1156; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1157; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1158; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1159; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1160; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
1161; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
1162; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
1163; GFX12-CU-NEXT:    s_wait_storecnt 0x0
1164; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
1165; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
1166; GFX12-CU-NEXT:    s_endpgm
1167    i32 %in, ptr %out) {
1168entry:
  ; The only memory operation in the kernel: store %in to %out with
  ; `release` atomic ordering.
1169  store atomic i32 %in, ptr %out release, align 4
1170  ret void
1171}
1172
; Kernel under test: a `seq_cst` atomic i32 store through a flat pointer,
; with no syncscope specified on the instruction.
; Every configuration expects extra instructions immediately before the
; flat store:
;   - both gfx700 runs expect `s_waitcnt vmcnt(0) lgkmcnt(0)`;
;   - gfx1010 and gfx1100 expect that wait plus `s_waitcnt_vscnt null, 0x0`;
;   - gfx90a expects `buffer_wbl2` followed by the wait;
;   - gfx940 expects `buffer_wbl2 sc0 sc1` followed by the wait, and
;     `sc0 sc1` on the store itself;
;   - gfx1200 expects `global_wb scope:SCOPE_SYS`, then waits on the bvh,
;     sample, store and load/ds counters, and `scope:SCOPE_SYS` on the store.
; Nothing is expected after the store other than `s_endpgm`.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with the update script instead of editing by hand.
1173define amdgpu_kernel void @flat_system_seq_cst_store(
1174; GFX7-LABEL: flat_system_seq_cst_store:
1175; GFX7:       ; %bb.0: ; %entry
1176; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
1177; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
1178; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1180; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1181; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1182; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1183; GFX7-NEXT:    flat_store_dword v[0:1], v2
1184; GFX7-NEXT:    s_endpgm
1185;
1186; GFX10-WGP-LABEL: flat_system_seq_cst_store:
1187; GFX10-WGP:       ; %bb.0: ; %entry
1188; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
1189; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1190; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1191; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1192; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1193; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1194; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1195; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1196; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1197; GFX10-WGP-NEXT:    s_endpgm
1198;
1199; GFX10-CU-LABEL: flat_system_seq_cst_store:
1200; GFX10-CU:       ; %bb.0: ; %entry
1201; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
1202; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1203; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1204; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1205; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1206; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1207; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1208; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1209; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1210; GFX10-CU-NEXT:    s_endpgm
1211;
1212; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store:
1213; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1214; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
1215; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1216; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1217; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1218; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1219; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1220; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1221; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1222; SKIP-CACHE-INV-NEXT:    s_endpgm
1223;
1224; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
1225; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1226; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1227; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1228; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1230; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1231; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1232; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1233; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1234; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1235;
1236; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
1237; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1238; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1239; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1240; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1241; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1242; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1243; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1244; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1245; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1246; GFX90A-TGSPLIT-NEXT:    s_endpgm
1247;
1248; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
1249; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1250; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1251; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1252; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1253; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1254; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1255; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1256; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1257; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1258; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1259;
1260; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_store:
1261; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1262; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1263; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1264; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1265; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1266; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1267; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1268; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1269; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1270; GFX940-TGSPLIT-NEXT:    s_endpgm
1271;
1272; GFX11-WGP-LABEL: flat_system_seq_cst_store:
1273; GFX11-WGP:       ; %bb.0: ; %entry
1274; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1275; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1276; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1278; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1279; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1280; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1281; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1282; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1283; GFX11-WGP-NEXT:    s_endpgm
1284;
1285; GFX11-CU-LABEL: flat_system_seq_cst_store:
1286; GFX11-CU:       ; %bb.0: ; %entry
1287; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1288; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1289; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1290; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1291; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1292; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1293; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1294; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1295; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1296; GFX11-CU-NEXT:    s_endpgm
1297;
1298; GFX12-WGP-LABEL: flat_system_seq_cst_store:
1299; GFX12-WGP:       ; %bb.0: ; %entry
1300; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1301; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1302; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1303; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1304; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1305; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1306; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
1307; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1308; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1309; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1310; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1311; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
1312; GFX12-WGP-NEXT:    s_endpgm
1313;
1314; GFX12-CU-LABEL: flat_system_seq_cst_store:
1315; GFX12-CU:       ; %bb.0: ; %entry
1316; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1317; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1318; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1319; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1320; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1321; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1322; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
1323; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
1324; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
1325; GFX12-CU-NEXT:    s_wait_storecnt 0x0
1326; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
1327; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
1328; GFX12-CU-NEXT:    s_endpgm
1329    i32 %in, ptr %out) {
1330entry:
  ; The only memory operation in the kernel: store %in to %out with
  ; `seq_cst` atomic ordering.
1331  store atomic i32 %in, ptr %out seq_cst, align 4
1332  ret void
1333}
1334
; Kernel under test: a volatile `atomicrmw xchg` with monotonic ordering and no
; syncscope, performed through a plain `ptr` argument (no addrspace qualifier).
; The result %val is not used.
;
; For every run line the expectations below consist only of the kernel-argument
; loads, a single wait for those loads, the moves of address and value into
; VGPRs, and one flat atomic swap. No additional waits or cache-maintenance
; instructions are expected around the atomic. The gfx940 expectations carry
; `sc1` on the swap and the gfx1200 expectations carry `scope:SCOPE_SYS`.
;
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1335define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
1336; GFX7-LABEL: flat_system_monotonic_atomicrmw:
1337; GFX7:       ; %bb.0: ; %entry
1338; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1339; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1340; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1342; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1343; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1344; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1345; GFX7-NEXT:    s_endpgm
1346;
1347; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
1348; GFX10-WGP:       ; %bb.0: ; %entry
1349; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1350; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1351; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1352; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1353; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1354; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1355; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1356; GFX10-WGP-NEXT:    s_endpgm
1357;
1358; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
1359; GFX10-CU:       ; %bb.0: ; %entry
1360; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1361; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1362; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1363; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1364; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1365; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1366; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1367; GFX10-CU-NEXT:    s_endpgm
1368;
1369; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw:
1370; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1371; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1372; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1373; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1374; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1377; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1378; SKIP-CACHE-INV-NEXT:    s_endpgm
1379;
1380; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
1381; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1382; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1383; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1384; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1386; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1387; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1388; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1389;
1390; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
1391; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1392; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1393; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1394; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1395; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1396; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1397; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1398; GFX90A-TGSPLIT-NEXT:    s_endpgm
1399;
1400; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
1401; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1402; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1403; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1404; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1406; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1407; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1408; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1409;
1410; GFX940-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
1411; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1412; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1413; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1414; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1415; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1416; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1417; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1418; GFX940-TGSPLIT-NEXT:    s_endpgm
1419;
1420; GFX11-WGP-LABEL: flat_system_monotonic_atomicrmw:
1421; GFX11-WGP:       ; %bb.0: ; %entry
1422; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1423; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1424; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1425; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1426; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1427; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1428; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1429; GFX11-WGP-NEXT:    s_endpgm
1430;
1431; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw:
1432; GFX11-CU:       ; %bb.0: ; %entry
1433; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1434; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1435; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1436; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1437; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1438; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1439; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1440; GFX11-CU-NEXT:    s_endpgm
1441;
1442; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw:
1443; GFX12-WGP:       ; %bb.0: ; %entry
1444; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1445; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1446; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1447; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1448; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1449; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1450; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1451; GFX12-WGP-NEXT:    s_endpgm
1452;
1453; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw:
1454; GFX12-CU:       ; %bb.0: ; %entry
1455; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1456; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1457; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1458; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1459; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1460; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1461; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1462; GFX12-CU-NEXT:    s_endpgm
1463    ptr %out, i32 %in) {
1464entry:
1465  %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
1466  ret void
1467}
1468
; Kernel under test: a volatile `atomicrmw xchg` with acquire ordering and no
; syncscope, performed through a plain `ptr` argument (no addrspace qualifier).
; The result %val is not used.
;
; Compared with the monotonic variant, the expectations below add code AFTER
; the flat atomic swap: a wait, followed by cache invalidation instructions:
;   - gfx700:          buffer_wbinvl1_vol
;   - gfx1010/gfx1100: buffer_gl1_inv + buffer_gl0_inv
;   - gfx90a:          buffer_invl2 + buffer_wbinvl1_vol
;   - gfx940:          buffer_inv sc0 sc1
;   - gfx1200:         global_inv scope:SCOPE_SYS
; The run that passes -amdgcn-skip-cache-invalidations expects only the wait,
; with no invalidation after it. The +tgsplit runs expect `s_waitcnt vmcnt(0)`
; where the runs without tgsplit expect `s_waitcnt vmcnt(0) lgkmcnt(0)`.
;
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1469define amdgpu_kernel void @flat_system_acquire_atomicrmw(
1470; GFX7-LABEL: flat_system_acquire_atomicrmw:
1471; GFX7:       ; %bb.0: ; %entry
1472; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1473; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1474; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1475; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1476; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1477; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1478; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1479; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1480; GFX7-NEXT:    buffer_wbinvl1_vol
1481; GFX7-NEXT:    s_endpgm
1482;
1483; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
1484; GFX10-WGP:       ; %bb.0: ; %entry
1485; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1486; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1487; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1488; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1489; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1490; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1491; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1492; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1493; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1494; GFX10-WGP-NEXT:    buffer_gl1_inv
1495; GFX10-WGP-NEXT:    buffer_gl0_inv
1496; GFX10-WGP-NEXT:    s_endpgm
1497;
1498; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
1499; GFX10-CU:       ; %bb.0: ; %entry
1500; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1501; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1502; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1504; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1505; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1506; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1507; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1509; GFX10-CU-NEXT:    buffer_gl1_inv
1510; GFX10-CU-NEXT:    buffer_gl0_inv
1511; GFX10-CU-NEXT:    s_endpgm
1512;
1513; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw:
1514; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1515; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1516; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1517; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1518; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1519; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1520; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1521; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1522; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1523; SKIP-CACHE-INV-NEXT:    s_endpgm
1524;
1525; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
1526; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1527; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1528; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1529; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1530; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1531; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1532; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1533; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1534; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1535; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1536; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1537;
1538; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
1539; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1540; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1541; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1542; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1544; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1545; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1546; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1547; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1548; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1549; GFX90A-TGSPLIT-NEXT:    s_endpgm
1550;
1551; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
1552; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1553; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1554; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1555; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1556; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1557; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1558; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1559; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1560; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
1561; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1562;
1563; GFX940-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
1564; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1565; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1566; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1567; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1569; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1570; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1571; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1572; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
1573; GFX940-TGSPLIT-NEXT:    s_endpgm
1574;
1575; GFX11-WGP-LABEL: flat_system_acquire_atomicrmw:
1576; GFX11-WGP:       ; %bb.0: ; %entry
1577; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1578; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1579; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1580; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1581; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1582; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1583; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1584; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1586; GFX11-WGP-NEXT:    buffer_gl1_inv
1587; GFX11-WGP-NEXT:    buffer_gl0_inv
1588; GFX11-WGP-NEXT:    s_endpgm
1589;
1590; GFX11-CU-LABEL: flat_system_acquire_atomicrmw:
1591; GFX11-CU:       ; %bb.0: ; %entry
1592; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1593; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1594; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1596; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1597; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1598; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1599; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1600; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1601; GFX11-CU-NEXT:    buffer_gl1_inv
1602; GFX11-CU-NEXT:    buffer_gl0_inv
1603; GFX11-CU-NEXT:    s_endpgm
1604;
1605; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw:
1606; GFX12-WGP:       ; %bb.0: ; %entry
1607; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1608; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1609; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1610; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1611; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1612; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1613; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1614; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
1615; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
1616; GFX12-WGP-NEXT:    s_endpgm
1617;
1618; GFX12-CU-LABEL: flat_system_acquire_atomicrmw:
1619; GFX12-CU:       ; %bb.0: ; %entry
1620; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1621; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1622; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1623; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1624; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1625; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1626; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1627; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
1628; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
1629; GFX12-CU-NEXT:    s_endpgm
1630    ptr %out, i32 %in) {
1631entry:
1632  %val = atomicrmw volatile xchg ptr %out, i32 %in acquire
1633  ret void
1634}
1635
; Kernel under test: a volatile `atomicrmw xchg` with release ordering and no
; syncscope, performed through a plain `ptr` argument (no addrspace qualifier).
; The result %val is not used.
;
; Compared with the monotonic variant, the expectations below add code BEFORE
; the flat atomic swap and nothing after it:
;   - gfx700 (both triples): s_waitcnt vmcnt(0) lgkmcnt(0)
;   - gfx1010/gfx1100:       s_waitcnt vmcnt(0) lgkmcnt(0) + s_waitcnt_vscnt null, 0x0
;   - gfx90a:                buffer_wbl2, then s_waitcnt vmcnt(0) lgkmcnt(0)
;   - gfx940:                buffer_wbl2 sc0 sc1, then s_waitcnt vmcnt(0) lgkmcnt(0)
;   - gfx1200:               global_wb scope:SCOPE_SYS, then s_wait_bvhcnt,
;                            s_wait_samplecnt, s_wait_storecnt and
;                            s_wait_loadcnt_dscnt, each with 0x0
; Unlike the acquire variant, the +tgsplit and non-tgsplit expectations are the
; same here, and so are the expectations with and without +cumode.
;
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1636define amdgpu_kernel void @flat_system_release_atomicrmw(
1637; GFX7-LABEL: flat_system_release_atomicrmw:
1638; GFX7:       ; %bb.0: ; %entry
1639; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1640; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1641; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1642; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1643; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1644; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1645; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1646; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1647; GFX7-NEXT:    s_endpgm
1648;
1649; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
1650; GFX10-WGP:       ; %bb.0: ; %entry
1651; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1652; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1653; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1654; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1655; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1656; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1657; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1658; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1659; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1660; GFX10-WGP-NEXT:    s_endpgm
1661;
1662; GFX10-CU-LABEL: flat_system_release_atomicrmw:
1663; GFX10-CU:       ; %bb.0: ; %entry
1664; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1665; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1666; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1667; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1668; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1669; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1670; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1671; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1672; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1673; GFX10-CU-NEXT:    s_endpgm
1674;
1675; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw:
1676; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1677; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1678; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1679; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1681; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1683; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1684; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1685; SKIP-CACHE-INV-NEXT:    s_endpgm
1686;
1687; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
1688; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1689; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1690; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1691; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1692; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1693; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1694; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1695; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1696; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1697; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1698;
1699; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
1700; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1701; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1702; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1703; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1704; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1705; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1706; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1707; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1708; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1709; GFX90A-TGSPLIT-NEXT:    s_endpgm
1710;
1711; GFX940-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
1712; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1713; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1714; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1715; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1716; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1717; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1718; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1719; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1720; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1721; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1722;
1723; GFX940-TGSPLIT-LABEL: flat_system_release_atomicrmw:
1724; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1725; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1726; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1727; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1729; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1730; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1731; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1732; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1733; GFX940-TGSPLIT-NEXT:    s_endpgm
1734;
1735; GFX11-WGP-LABEL: flat_system_release_atomicrmw:
1736; GFX11-WGP:       ; %bb.0: ; %entry
1737; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1738; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1739; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1741; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1742; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1743; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1744; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1745; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1746; GFX11-WGP-NEXT:    s_endpgm
1747;
1748; GFX11-CU-LABEL: flat_system_release_atomicrmw:
1749; GFX11-CU:       ; %bb.0: ; %entry
1750; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1751; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1752; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1754; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1755; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1756; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1757; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1758; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1759; GFX11-CU-NEXT:    s_endpgm
1760;
1761; GFX12-WGP-LABEL: flat_system_release_atomicrmw:
1762; GFX12-WGP:       ; %bb.0: ; %entry
1763; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1764; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1765; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1766; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1767; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1768; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1769; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
1770; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1771; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1772; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1773; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1774; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1775; GFX12-WGP-NEXT:    s_endpgm
1776;
1777; GFX12-CU-LABEL: flat_system_release_atomicrmw:
1778; GFX12-CU:       ; %bb.0: ; %entry
1779; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1780; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1781; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1782; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1783; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1784; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1785; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
1786; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
1787; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
1788; GFX12-CU-NEXT:    s_wait_storecnt 0x0
1789; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
1790; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1791; GFX12-CU-NEXT:    s_endpgm
1792    ptr %out, i32 %in) {
1793entry:
1794  %val = atomicrmw volatile xchg ptr %out, i32 %in release
1795  ret void
1796}
1797
; Kernel under test: a volatile `atomicrmw xchg` with acq_rel ordering and no
; syncscope, performed through a plain `ptr` argument (no addrspace qualifier).
; The result %val is not used.
;
; The expectations below surround the flat atomic swap on both sides:
;   - BEFORE the swap: a wait (s_waitcnt vmcnt(0) lgkmcnt(0), plus
;     s_waitcnt_vscnt null, 0x0 on gfx1010/gfx1100, or the s_wait_* sequence on
;     gfx1200), preceded by buffer_wbl2 on gfx90a, buffer_wbl2 sc0 sc1 on gfx940
;     and global_wb scope:SCOPE_SYS on gfx1200.
;   - AFTER the swap: a second wait followed by cache invalidation instructions
;     (buffer_wbinvl1_vol on gfx700; buffer_gl1_inv + buffer_gl0_inv on
;     gfx1010/gfx1100; buffer_invl2 + buffer_wbinvl1_vol on gfx90a;
;     buffer_inv sc0 sc1 on gfx940; global_inv scope:SCOPE_SYS on gfx1200).
; The run that passes -amdgcn-skip-cache-invalidations expects both waits but no
; invalidation. The +tgsplit runs expect the trailing wait to be
; `s_waitcnt vmcnt(0)` rather than `s_waitcnt vmcnt(0) lgkmcnt(0)`.
;
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1798define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
1799; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
1800; GFX7:       ; %bb.0: ; %entry
1801; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1802; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1803; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1804; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1805; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1806; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1807; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1808; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1809; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1810; GFX7-NEXT:    buffer_wbinvl1_vol
1811; GFX7-NEXT:    s_endpgm
1812;
1813; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
1814; GFX10-WGP:       ; %bb.0: ; %entry
1815; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1816; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1817; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1818; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1819; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1820; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1821; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1822; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1823; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1824; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1826; GFX10-WGP-NEXT:    buffer_gl1_inv
1827; GFX10-WGP-NEXT:    buffer_gl0_inv
1828; GFX10-WGP-NEXT:    s_endpgm
1829;
1830; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
1831; GFX10-CU:       ; %bb.0: ; %entry
1832; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1833; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1834; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1835; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1836; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1837; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1838; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1839; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1840; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1841; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1842; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1843; GFX10-CU-NEXT:    buffer_gl1_inv
1844; GFX10-CU-NEXT:    buffer_gl0_inv
1845; GFX10-CU-NEXT:    s_endpgm
1846;
1847; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw:
1848; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1849; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1850; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1851; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1852; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1853; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1854; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1855; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1856; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1857; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1858; SKIP-CACHE-INV-NEXT:    s_endpgm
1859;
1860; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
1861; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1862; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1863; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1864; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1866; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1867; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
1868; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1869; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1870; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1871; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
1872; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
1873; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1874;
1875; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
1876; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1877; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1878; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1879; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1880; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1881; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1882; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
1883; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1884; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1885; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1886; GFX90A-TGSPLIT-NEXT:    buffer_invl2
1887; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
1888; GFX90A-TGSPLIT-NEXT:    s_endpgm
1889;
1890; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
1891; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1892; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1893; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1894; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1895; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1896; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1897; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1898; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1899; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1900; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1901; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
1902; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1903;
1904; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
1905; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1906; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1907; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1908; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1909; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1910; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1911; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
1912; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1913; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
1914; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1915; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
1916; GFX940-TGSPLIT-NEXT:    s_endpgm
1917;
1918; GFX11-WGP-LABEL: flat_system_acq_rel_atomicrmw:
1919; GFX11-WGP:       ; %bb.0: ; %entry
1920; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1921; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1922; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1923; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1924; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1925; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1926; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1927; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1928; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1929; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1930; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1931; GFX11-WGP-NEXT:    buffer_gl1_inv
1932; GFX11-WGP-NEXT:    buffer_gl0_inv
1933; GFX11-WGP-NEXT:    s_endpgm
1934;
1935; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw:
1936; GFX11-CU:       ; %bb.0: ; %entry
1937; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1938; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1939; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1940; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1941; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1942; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1943; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1944; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1945; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1946; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1947; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1948; GFX11-CU-NEXT:    buffer_gl1_inv
1949; GFX11-CU-NEXT:    buffer_gl0_inv
1950; GFX11-CU-NEXT:    s_endpgm
1951;
1952; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw:
1953; GFX12-WGP:       ; %bb.0: ; %entry
1954; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1955; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1956; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1957; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1958; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1959; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1960; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
1961; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1962; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1963; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
1964; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
1965; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1966; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
1967; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
1968; GFX12-WGP-NEXT:    s_endpgm
1969;
1970; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw:
1971; GFX12-CU:       ; %bb.0: ; %entry
1972; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1973; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1974; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1975; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1976; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1977; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1978; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
1979; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
1980; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
1981; GFX12-CU-NEXT:    s_wait_storecnt 0x0
1982; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
1983; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
1984; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
1985; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
1986; GFX12-CU-NEXT:    s_endpgm
1987    ptr %out, i32 %in) {
1988entry:
1989  %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
1990  ret void
1991}
1992
1993define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
1994; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
1995; GFX7:       ; %bb.0: ; %entry
1996; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1997; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1998; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1999; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2000; GFX7-NEXT:    v_mov_b32_e32 v1, s7
2001; GFX7-NEXT:    v_mov_b32_e32 v2, s4
2002; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2003; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
2004; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2005; GFX7-NEXT:    buffer_wbinvl1_vol
2006; GFX7-NEXT:    s_endpgm
2007;
2008; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
2009; GFX10-WGP:       ; %bb.0: ; %entry
2010; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
2011; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
2012; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2013; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2014; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
2015; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
2016; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2017; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2018; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
2019; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2021; GFX10-WGP-NEXT:    buffer_gl1_inv
2022; GFX10-WGP-NEXT:    buffer_gl0_inv
2023; GFX10-WGP-NEXT:    s_endpgm
2024;
2025; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
2026; GFX10-CU:       ; %bb.0: ; %entry
2027; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
2028; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
2029; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2030; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2031; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
2032; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
2033; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2034; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2035; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
2036; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2038; GFX10-CU-NEXT:    buffer_gl1_inv
2039; GFX10-CU-NEXT:    buffer_gl0_inv
2040; GFX10-CU-NEXT:    s_endpgm
2041;
2042; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw:
2043; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2044; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2045; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
2046; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2047; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2048; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2050; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2051; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
2052; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2053; SKIP-CACHE-INV-NEXT:    s_endpgm
2054;
2055; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
2056; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2057; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
2058; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
2059; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2060; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
2061; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
2062; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2063; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2064; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
2065; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2066; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2067; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2068; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2069;
2070; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
2071; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2072; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
2073; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
2074; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2075; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
2076; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
2077; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2078; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2079; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
2080; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2081; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2082; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2083; GFX90A-TGSPLIT-NEXT:    s_endpgm
2084;
2085; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
2086; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2087; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2088; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
2089; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2090; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2091; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
2092; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
2093; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2094; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
2095; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2096; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
2097; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2098;
2099; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
2100; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2101; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2102; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
2103; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2104; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
2105; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
2106; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
2107; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2108; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
2109; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2110; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
2111; GFX940-TGSPLIT-NEXT:    s_endpgm
2112;
2113; GFX11-WGP-LABEL: flat_system_seq_cst_atomicrmw:
2114; GFX11-WGP:       ; %bb.0: ; %entry
2115; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2116; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
2117; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2118; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
2119; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
2120; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
2121; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2122; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2123; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
2124; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2125; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2126; GFX11-WGP-NEXT:    buffer_gl1_inv
2127; GFX11-WGP-NEXT:    buffer_gl0_inv
2128; GFX11-WGP-NEXT:    s_endpgm
2129;
2130; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw:
2131; GFX11-CU:       ; %bb.0: ; %entry
2132; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2133; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
2134; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2135; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
2136; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
2137; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
2138; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2139; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2140; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
2141; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2143; GFX11-CU-NEXT:    buffer_gl1_inv
2144; GFX11-CU-NEXT:    buffer_gl0_inv
2145; GFX11-CU-NEXT:    s_endpgm
2146;
2147; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw:
2148; GFX12-WGP:       ; %bb.0: ; %entry
2149; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2150; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
2151; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2152; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
2153; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
2154; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
2155; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
2156; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2157; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2158; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
2159; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2160; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
2161; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
2162; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
2163; GFX12-WGP-NEXT:    s_endpgm
2164;
2165; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw:
2166; GFX12-CU:       ; %bb.0: ; %entry
2167; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
2168; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
2169; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2170; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
2171; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
2172; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
2173; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
2174; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
2175; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
2176; GFX12-CU-NEXT:    s_wait_storecnt 0x0
2177; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2178; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
2179; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
2180; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
2181; GFX12-CU-NEXT:    s_endpgm
2182    ptr %out, i32 %in) {
2183entry:
2184  %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
2185  ret void
2186}
2187
; Atomically exchanges %in into the i32 at flat pointer %out with acquire
; ordering (no syncscope is given, so the default system scope applies), then
; stores the previous value returned by the exchange back to %out.
; For every RUN configuration the autogenerated assertions below expect the
; returning form of the flat swap, followed by a wait and, except under the
; SKIP-CACHE-INV prefix, cache-invalidate instruction(s) before the final store.
2188define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
2189; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
2190; GFX7:       ; %bb.0: ; %entry
2191; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2192; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2193; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2194; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2195; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2196; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2197; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2198; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2199; GFX7-NEXT:    buffer_wbinvl1_vol
2200; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2201; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2202; GFX7-NEXT:    flat_store_dword v[0:1], v2
2203; GFX7-NEXT:    s_endpgm
2204;
2205; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
2206; GFX10-WGP:       ; %bb.0: ; %entry
2207; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2208; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2209; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2210; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2211; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2212; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2213; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2214; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2215; GFX10-WGP-NEXT:    buffer_gl1_inv
2216; GFX10-WGP-NEXT:    buffer_gl0_inv
2217; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2218; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2219; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2220; GFX10-WGP-NEXT:    s_endpgm
2221;
2222; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
2223; GFX10-CU:       ; %bb.0: ; %entry
2224; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2225; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2226; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2227; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2228; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2229; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2230; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2231; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2232; GFX10-CU-NEXT:    buffer_gl1_inv
2233; GFX10-CU-NEXT:    buffer_gl0_inv
2234; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2235; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2236; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2237; GFX10-CU-NEXT:    s_endpgm
2238;
2239; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw:
2240; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2241; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2242; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2243; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2246; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2247; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2248; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2251; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2252; SKIP-CACHE-INV-NEXT:    s_endpgm
2253;
2254; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
2255; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2256; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2257; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2258; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2259; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2260; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2261; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2262; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2263; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2264; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2265; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2266; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2267; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2268;
2269; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
2270; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2271; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2272; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2273; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2274; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2275; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2276; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2277; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2278; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2279; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2280; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2281; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2282; GFX90A-TGSPLIT-NEXT:    s_endpgm
2283;
2284; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
2285; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2286; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2287; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2288; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2289; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2290; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2291; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
2292; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2293; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
2294; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2295; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2296; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2297;
2298; GFX940-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
2299; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2300; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2301; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2302; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2303; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2304; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2305; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
2306; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2307; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
2308; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2309; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2310; GFX940-TGSPLIT-NEXT:    s_endpgm
2311;
2312; GFX11-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
2313; GFX11-WGP:       ; %bb.0: ; %entry
2314; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2315; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2316; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2318; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2319; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2320; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2321; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2322; GFX11-WGP-NEXT:    buffer_gl1_inv
2323; GFX11-WGP-NEXT:    buffer_gl0_inv
2324; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2325; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2326; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2327; GFX11-WGP-NEXT:    s_endpgm
2328;
2329; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw:
2330; GFX11-CU:       ; %bb.0: ; %entry
2331; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2332; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2333; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2334; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2335; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2336; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2337; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2338; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2339; GFX11-CU-NEXT:    buffer_gl1_inv
2340; GFX11-CU-NEXT:    buffer_gl0_inv
2341; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2342; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2343; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2344; GFX11-CU-NEXT:    s_endpgm
2345;
2346; GFX12-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
2347; GFX12-WGP:       ; %bb.0: ; %entry
2348; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2349; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2350; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2351; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2352; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2353; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2354; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2355; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2356; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
2357; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2358; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2359; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2360; GFX12-WGP-NEXT:    s_endpgm
2361;
2362; GFX12-CU-LABEL: flat_system_acquire_ret_atomicrmw:
2363; GFX12-CU:       ; %bb.0: ; %entry
2364; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2365; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2366; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2367; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2368; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2369; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2370; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2371; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2372; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
2373; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2374; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2375; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2376; GFX12-CU-NEXT:    s_endpgm
2377    ptr %out, i32 %in) {
2378entry:
2379  %val = atomicrmw volatile xchg ptr %out, i32 %in acquire
  ; Consuming %val here is presumably what makes the backend select the
  ; returning swap asserted above - NOTE(review): confirm if this test is reshaped.
2380  store i32 %val, ptr %out, align 4
2381  ret void
2382}
2383
; Atomically exchanges %in into the i32 at flat pointer %out with acq_rel
; ordering (no syncscope is given, so the default system scope applies), then
; stores the previous value returned by the exchange back to %out.
; The autogenerated assertions below expect release-side code before the
; returning swap (an s_waitcnt / s_wait_* sequence for every prefix, preceded
; by buffer_wbl2 or global_wb on the GFX90A, GFX940 and GFX12 prefixes) and
; acquire-side code after it (a wait and, except under the SKIP-CACHE-INV
; prefix, cache-invalidate instruction(s)) before the final store.
2384define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
2385; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
2386; GFX7:       ; %bb.0: ; %entry
2387; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2388; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2389; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2391; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2392; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2393; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2394; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2395; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2396; GFX7-NEXT:    buffer_wbinvl1_vol
2397; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2398; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2399; GFX7-NEXT:    flat_store_dword v[0:1], v2
2400; GFX7-NEXT:    s_endpgm
2401;
2402; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
2403; GFX10-WGP:       ; %bb.0: ; %entry
2404; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2405; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2406; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2407; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2408; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2409; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2410; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2411; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2412; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2413; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2414; GFX10-WGP-NEXT:    buffer_gl1_inv
2415; GFX10-WGP-NEXT:    buffer_gl0_inv
2416; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2417; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2418; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2419; GFX10-WGP-NEXT:    s_endpgm
2420;
2421; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
2422; GFX10-CU:       ; %bb.0: ; %entry
2423; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2424; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2425; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2426; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2427; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2428; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2429; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2430; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2431; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2432; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2433; GFX10-CU-NEXT:    buffer_gl1_inv
2434; GFX10-CU-NEXT:    buffer_gl0_inv
2435; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2436; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2437; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2438; GFX10-CU-NEXT:    s_endpgm
2439;
2440; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw:
2441; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2442; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2443; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2444; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2445; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2446; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2447; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2448; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2449; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2450; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2451; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2452; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2453; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2454; SKIP-CACHE-INV-NEXT:    s_endpgm
2455;
2456; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
2457; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2458; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2459; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2460; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2461; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2462; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2463; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2464; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2465; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2466; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2467; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2468; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2469; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2470; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2471; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2472;
2473; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
2474; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2475; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2476; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2477; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2478; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2479; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2480; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2481; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2482; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2483; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2484; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2485; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2486; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2487; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2488; GFX90A-TGSPLIT-NEXT:    s_endpgm
2489;
2490; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
2491; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2492; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2493; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2494; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2495; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2496; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2497; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
2498; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2499; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
2500; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2501; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
2502; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2503; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2504; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2505;
2506; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
2507; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2508; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2509; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2510; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2512; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2513; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
2514; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2515; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
2516; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2517; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
2518; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2519; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2520; GFX940-TGSPLIT-NEXT:    s_endpgm
2521;
2522; GFX11-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
2523; GFX11-WGP:       ; %bb.0: ; %entry
2524; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2525; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2526; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2527; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2528; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2529; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2530; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2531; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2532; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2533; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2534; GFX11-WGP-NEXT:    buffer_gl1_inv
2535; GFX11-WGP-NEXT:    buffer_gl0_inv
2536; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2537; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2538; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2539; GFX11-WGP-NEXT:    s_endpgm
2540;
2541; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
2542; GFX11-CU:       ; %bb.0: ; %entry
2543; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2544; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2545; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2546; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2547; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2548; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2549; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2550; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2551; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2552; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2553; GFX11-CU-NEXT:    buffer_gl1_inv
2554; GFX11-CU-NEXT:    buffer_gl0_inv
2555; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2556; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2557; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2558; GFX11-CU-NEXT:    s_endpgm
2559;
2560; GFX12-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
2561; GFX12-WGP:       ; %bb.0: ; %entry
2562; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2563; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2564; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2565; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2566; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2567; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2568; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
2569; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2570; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2571; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
2572; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2573; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2574; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2575; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2576; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2577; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
2578; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2579; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2580; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2581; GFX12-WGP-NEXT:    s_endpgm
2582;
2583; GFX12-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
2584; GFX12-CU:       ; %bb.0: ; %entry
2585; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2586; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2587; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2588; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2589; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2590; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2591; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
2592; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
2593; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
2594; GFX12-CU-NEXT:    s_wait_storecnt 0x0
2595; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2596; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2597; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
2598; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
2599; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2600; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
2601; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2602; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2603; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2604; GFX12-CU-NEXT:    s_endpgm
2605    ptr %out, i32 %in) {
2606entry:
2607  %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
  ; Consuming %val here is presumably what makes the backend select the
  ; returning swap asserted above - NOTE(review): confirm if this test is reshaped.
2608  store i32 %val, ptr %out, align 4
2609  ret void
2610}
2611
2612define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
2613; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
2614; GFX7:       ; %bb.0: ; %entry
2615; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2616; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2617; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2618; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2619; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2620; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2621; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2622; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2623; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2624; GFX7-NEXT:    buffer_wbinvl1_vol
2625; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2626; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2627; GFX7-NEXT:    flat_store_dword v[0:1], v2
2628; GFX7-NEXT:    s_endpgm
2629;
2630; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
2631; GFX10-WGP:       ; %bb.0: ; %entry
2632; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2633; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2634; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2635; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2636; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2637; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2638; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2639; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2640; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2641; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2642; GFX10-WGP-NEXT:    buffer_gl1_inv
2643; GFX10-WGP-NEXT:    buffer_gl0_inv
2644; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2645; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2646; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2647; GFX10-WGP-NEXT:    s_endpgm
2648;
2649; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
2650; GFX10-CU:       ; %bb.0: ; %entry
2651; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2652; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2653; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2654; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2655; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2656; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2657; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2658; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2659; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2660; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2661; GFX10-CU-NEXT:    buffer_gl1_inv
2662; GFX10-CU-NEXT:    buffer_gl0_inv
2663; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2664; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2665; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2666; GFX10-CU-NEXT:    s_endpgm
2667;
2668; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw:
2669; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2670; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2671; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2672; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2673; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2674; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2675; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2676; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2677; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2678; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2679; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2681; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2682; SKIP-CACHE-INV-NEXT:    s_endpgm
2683;
2684; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
2685; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2686; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2687; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2688; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2689; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2690; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2691; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
2692; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2693; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2694; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2695; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
2696; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
2697; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2698; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2699; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2700;
2701; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
2702; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2703; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2704; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2705; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2706; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2707; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2708; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
2709; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2710; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2711; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2712; GFX90A-TGSPLIT-NEXT:    buffer_invl2
2713; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
2714; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2715; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2716; GFX90A-TGSPLIT-NEXT:    s_endpgm
2717;
2718; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
2719; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2720; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2721; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2722; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2723; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2724; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2725; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
2726; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2727; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
2728; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2729; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
2730; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2731; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2732; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2733;
2734; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
2735; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2736; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2737; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2738; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2739; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2740; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2741; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
2742; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2743; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
2744; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2745; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
2746; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2747; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2748; GFX940-TGSPLIT-NEXT:    s_endpgm
2749;
2750; GFX11-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
2751; GFX11-WGP:       ; %bb.0: ; %entry
2752; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2753; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2754; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2755; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2756; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2757; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2758; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2759; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2760; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2761; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2762; GFX11-WGP-NEXT:    buffer_gl1_inv
2763; GFX11-WGP-NEXT:    buffer_gl0_inv
2764; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2765; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2766; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2767; GFX11-WGP-NEXT:    s_endpgm
2768;
2769; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
2770; GFX11-CU:       ; %bb.0: ; %entry
2771; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2772; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2773; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2774; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2775; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2776; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2777; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2778; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2779; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2780; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2781; GFX11-CU-NEXT:    buffer_gl1_inv
2782; GFX11-CU-NEXT:    buffer_gl0_inv
2783; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2784; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2785; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2786; GFX11-CU-NEXT:    s_endpgm
2787;
2788; GFX12-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
2789; GFX12-WGP:       ; %bb.0: ; %entry
2790; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2791; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2792; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2793; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2794; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2795; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2796; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
2797; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2798; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2799; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
2800; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2801; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2802; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
2803; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
2804; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2805; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
2806; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2807; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2808; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2809; GFX12-WGP-NEXT:    s_endpgm
2810;
2811; GFX12-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
2812; GFX12-CU:       ; %bb.0: ; %entry
2813; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2814; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2815; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2816; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2817; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2818; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2819; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
2820; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
2821; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
2822; GFX12-CU-NEXT:    s_wait_storecnt 0x0
2823; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2824; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2825; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
2826; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
2827; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2828; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
2829; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2830; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2831; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2832; GFX12-CU-NEXT:    s_endpgm
2833    ptr %out, i32 %in) {
2834entry:
2835  %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
2836  store i32 %val, ptr %out, align 4
2837  ret void
2838}
2839
; Test: volatile cmpxchg through a flat pointer with monotonic success and
; monotonic failure ordering and no syncscope (system scope, per the test
; name); the value returned by the cmpxchg is never used.
; For every run line the expected code issues the no-return
; flat_atomic_cmpswap at %out + 16 bytes and is immediately followed by
; s_endpgm, i.e. no wait or cache maintenance instruction is expected after
; the atomic.
2840define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
2841; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2842; GFX7:       ; %bb.0: ; %entry
2843; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2844; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2845; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2846; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2847; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2848; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2849; GFX7-NEXT:    s_mov_b32 s4, s8
2850; GFX7-NEXT:    s_mov_b32 s5, s9
2851; GFX7-NEXT:    s_mov_b32 s9, s10
2852; GFX7-NEXT:    s_mov_b32 s8, s11
2853; GFX7-NEXT:    s_add_u32 s4, s4, s9
2854; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2855; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2856; GFX7-NEXT:    s_mov_b32 s5, s8
2857; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2858; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2859; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2860; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2861; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2862; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2863; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2864; GFX7-NEXT:    s_endpgm
2865;
2866; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2867; GFX10-WGP:       ; %bb.0: ; %entry
2868; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
2869; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2870; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
2871; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
2872; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
2873; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2874; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
2875; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
2876; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
2877; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
2878; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
2879; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
2880; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2881; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
2882; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
2883; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2884; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2885; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
2886; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2887; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2888; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2889; GFX10-WGP-NEXT:    s_endpgm
2890;
2891; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2892; GFX10-CU:       ; %bb.0: ; %entry
2893; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
2894; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2895; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
2896; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
2897; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
2898; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2899; GFX10-CU-NEXT:    s_mov_b32 s4, s8
2900; GFX10-CU-NEXT:    s_mov_b32 s5, s9
2901; GFX10-CU-NEXT:    s_mov_b32 s9, s10
2902; GFX10-CU-NEXT:    s_mov_b32 s8, s11
2903; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
2904; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
2905; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2906; GFX10-CU-NEXT:    s_mov_b32 s5, s8
2907; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
2908; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2909; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2910; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
2911; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2912; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2913; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2914; GFX10-CU-NEXT:    s_endpgm
2915;
2916; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2917; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2918; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
2919; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
2920; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
2921; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
2922; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
2923; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2924; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2925; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2926; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
2927; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
2928; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
2929; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
2930; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
2931; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
2932; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2934; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2935; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
2936; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2937; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2938; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2939; SKIP-CACHE-INV-NEXT:    s_endpgm
2940;
2941; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2942; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2943; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2944; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2945; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2946; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2947; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2948; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2949; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2950; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2951; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2952; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2953; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2954;
2955; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2956; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2957; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2958; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2959; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2960; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2961; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2962; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2963; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2964; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2965; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2966; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2967; GFX90A-TGSPLIT-NEXT:    s_endpgm
2968;
2969; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2970; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2971; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2972; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2973; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2974; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2975; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2976; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2977; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2978; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2979; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2980; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
2981; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2982;
2983; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2984; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2985; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2986; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2987; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2988; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2989; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2990; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2991; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2992; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2993; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2994; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
2995; GFX940-TGSPLIT-NEXT:    s_endpgm
2996;
2997; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
2998; GFX11-WGP:       ; %bb.0: ; %entry
2999; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3000; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3001; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3002; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3003; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3004; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3005; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3006; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3007; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3008; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3009; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3010; GFX11-WGP-NEXT:    s_endpgm
3011;
3012; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
3013; GFX11-CU:       ; %bb.0: ; %entry
3014; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3015; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3016; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3017; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3018; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3019; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3020; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3021; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3022; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3023; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3024; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3025; GFX11-CU-NEXT:    s_endpgm
3026;
3027; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
3028; GFX12-WGP:       ; %bb.0: ; %entry
3029; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3030; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3031; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3032; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3033; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3034; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3035; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3036; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3037; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3038; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3039; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3040; GFX12-WGP-NEXT:    s_endpgm
3041;
3042; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
3043; GFX12-CU:       ; %bb.0: ; %entry
3044; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3045; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3046; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3047; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3048; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3049; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3050; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3051; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3052; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3053; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3054; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3055; GFX12-CU-NEXT:    s_endpgm
3056    ptr %out, i32 %in, i32 %old) {
3057entry:
  ; Element 4 of an i32 array is byte offset 16. In the expected output the
  ; gfx90a and newer runs fold it into the instruction as "offset:16"; the
  ; gfx700 and gfx1010 runs add 16 to the address with s_add_u32/s_addc_u32.
3058  %gep = getelementptr i32, ptr %out, i32 4
  ; %val is never used, and the expected flat_atomic_cmpswap has no destination
  ; register. Operands: %old is the compare value, %in is the new value.
3059  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic
3060  ret void
3061}
3062
; Test: volatile cmpxchg through a flat pointer with acquire success ordering
; and monotonic failure ordering and no syncscope (system scope, per the test
; name); the value returned by the cmpxchg is never used.
; For every run line the expected code issues the no-return
; flat_atomic_cmpswap at %out + 16 bytes, then a wait, then the target's cache
; invalidation instruction(s) (buffer_wbinvl1_vol, buffer_invl2,
; buffer_gl1_inv/buffer_gl0_inv, buffer_inv or global_inv) before s_endpgm.
; The amdpal run with -amdgcn-skip-cache-invalidations keeps the wait but
; expects no invalidation instruction.
3063define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
3064; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
3065; GFX7:       ; %bb.0: ; %entry
3066; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3067; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3068; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3069; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3070; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3071; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3072; GFX7-NEXT:    s_mov_b32 s4, s8
3073; GFX7-NEXT:    s_mov_b32 s5, s9
3074; GFX7-NEXT:    s_mov_b32 s9, s10
3075; GFX7-NEXT:    s_mov_b32 s8, s11
3076; GFX7-NEXT:    s_add_u32 s4, s4, s9
3077; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3078; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3079; GFX7-NEXT:    s_mov_b32 s5, s8
3080; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3081; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3082; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3083; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3084; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3085; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3086; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3087; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3088; GFX7-NEXT:    buffer_wbinvl1_vol
3089; GFX7-NEXT:    s_endpgm
3090;
3091; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
3092; GFX10-WGP:       ; %bb.0: ; %entry
3093; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3094; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3095; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3096; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3097; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3098; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3099; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3100; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3101; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3102; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3103; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3104; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3105; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3106; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3107; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3108; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3109; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3110; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3111; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3112; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3113; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3114; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3115; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3116; GFX10-WGP-NEXT:    buffer_gl1_inv
3117; GFX10-WGP-NEXT:    buffer_gl0_inv
3118; GFX10-WGP-NEXT:    s_endpgm
3119;
3120; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
3121; GFX10-CU:       ; %bb.0: ; %entry
3122; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3123; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3124; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3125; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3126; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3127; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3128; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3129; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3130; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3131; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3132; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3133; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3134; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3135; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3136; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3137; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3138; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3139; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3140; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3141; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3142; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3143; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3144; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3145; GFX10-CU-NEXT:    buffer_gl1_inv
3146; GFX10-CU-NEXT:    buffer_gl0_inv
3147; GFX10-CU-NEXT:    s_endpgm
3148;
3149; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg:
3150; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3151; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3152; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3153; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3154; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3155; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3156; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3157; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3158; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3159; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3160; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3161; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3162; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3163; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3164; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3165; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3166; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3167; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3168; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3169; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3170; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3171; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3172; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3173; SKIP-CACHE-INV-NEXT:    s_endpgm
3174;
3175; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
3176; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3177; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3178; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3179; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3180; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3181; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3182; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3183; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3184; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3185; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3186; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3187; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3188; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3189; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3190; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3191;
3192; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
3193; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3194; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3195; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3196; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3197; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3198; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3199; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3200; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3201; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3202; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3203; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3204; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3205; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3206; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3207; GFX90A-TGSPLIT-NEXT:    s_endpgm
3208;
3209; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
3210; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3211; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3212; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3213; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3214; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3215; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3216; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3217; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3218; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3219; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3220; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
3221; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3222; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
3223; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3224;
3225; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
3226; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3227; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3228; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3229; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3230; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3231; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3232; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3233; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3234; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3235; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3236; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
3237; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3238; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
3239; GFX940-TGSPLIT-NEXT:    s_endpgm
3240;
3241; GFX11-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
3242; GFX11-WGP:       ; %bb.0: ; %entry
3243; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3244; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3245; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3246; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3247; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3248; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3249; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3250; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3251; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3252; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3253; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3254; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3255; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3256; GFX11-WGP-NEXT:    buffer_gl1_inv
3257; GFX11-WGP-NEXT:    buffer_gl0_inv
3258; GFX11-WGP-NEXT:    s_endpgm
3259;
3260; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
3261; GFX11-CU:       ; %bb.0: ; %entry
3262; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3263; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3264; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3265; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3266; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3267; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3268; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3269; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3270; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3271; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3272; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3273; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3274; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3275; GFX11-CU-NEXT:    buffer_gl1_inv
3276; GFX11-CU-NEXT:    buffer_gl0_inv
3277; GFX11-CU-NEXT:    s_endpgm
3278;
3279; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
3280; GFX12-WGP:       ; %bb.0: ; %entry
3281; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3282; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3283; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3284; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3285; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3286; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3287; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3288; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3289; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3290; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3291; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3292; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
3293; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
3294; GFX12-WGP-NEXT:    s_endpgm
3295;
3296; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
3297; GFX12-CU:       ; %bb.0: ; %entry
3298; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3299; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3300; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3301; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3302; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3303; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3304; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3305; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3306; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3307; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3308; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3309; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
3310; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
3311; GFX12-CU-NEXT:    s_endpgm
3312    ptr %out, i32 %in, i32 %old) {
3313entry:
  ; Element 4 of an i32 array is byte offset 16. In the expected output the
  ; gfx90a and newer runs fold it into the instruction as "offset:16"; the
  ; gfx700 and gfx1010 runs add 16 to the address with s_add_u32/s_addc_u32.
3314  %gep = getelementptr i32, ptr %out, i32 4
  ; %val is never used, and the expected flat_atomic_cmpswap has no destination
  ; register. The wait and invalidation in the expected output come after the
  ; atomic itself.
3315  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic
3316  ret void
3317}
3318
3319define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
3320; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
3321; GFX7:       ; %bb.0: ; %entry
3322; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3323; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3324; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3325; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3326; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3327; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3328; GFX7-NEXT:    s_mov_b32 s4, s8
3329; GFX7-NEXT:    s_mov_b32 s5, s9
3330; GFX7-NEXT:    s_mov_b32 s9, s10
3331; GFX7-NEXT:    s_mov_b32 s8, s11
3332; GFX7-NEXT:    s_add_u32 s4, s4, s9
3333; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3334; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3335; GFX7-NEXT:    s_mov_b32 s5, s8
3336; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3337; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3338; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3339; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3340; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3341; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3342; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3343; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3344; GFX7-NEXT:    s_endpgm
3345;
3346; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
3347; GFX10-WGP:       ; %bb.0: ; %entry
3348; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3349; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3350; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3351; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3352; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3353; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3354; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3355; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3356; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3357; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3358; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3359; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3360; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3361; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3362; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3363; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3364; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3365; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3366; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3367; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3368; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3369; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3370; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3371; GFX10-WGP-NEXT:    s_endpgm
3372;
3373; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
3374; GFX10-CU:       ; %bb.0: ; %entry
3375; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3376; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3377; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3378; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3379; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3380; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3381; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3382; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3383; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3384; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3385; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3386; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3387; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3388; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3389; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3390; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3391; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3392; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3393; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3394; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3395; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3396; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3397; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3398; GFX10-CU-NEXT:    s_endpgm
3399;
3400; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg:
3401; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3402; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3403; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3404; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3405; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3406; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3407; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3408; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3409; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3410; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3411; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3412; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3413; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3414; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3415; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3416; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3418; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3419; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3420; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3421; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3422; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3423; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3424; SKIP-CACHE-INV-NEXT:    s_endpgm
3425;
3426; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
3427; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3428; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3429; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3430; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3431; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3432; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3433; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3434; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3435; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3436; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3437; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3438; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3439; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3440; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3441;
3442; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
3443; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3444; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3445; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3446; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3447; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3448; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3449; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3450; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3451; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3452; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3453; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3454; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3455; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3456; GFX90A-TGSPLIT-NEXT:    s_endpgm
3457;
3458; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
3459; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3460; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3461; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3462; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3463; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3464; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3465; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3466; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3467; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3468; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3469; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
3470; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3471; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
3472; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3473;
3474; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
3475; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3476; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3477; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3478; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3479; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3480; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3481; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3482; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3483; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3484; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3485; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
3486; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3487; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
3488; GFX940-TGSPLIT-NEXT:    s_endpgm
3489;
3490; GFX11-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
3491; GFX11-WGP:       ; %bb.0: ; %entry
3492; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3493; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3494; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3495; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3496; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3497; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3498; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3499; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3500; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3501; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3502; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3503; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3504; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3505; GFX11-WGP-NEXT:    s_endpgm
3506;
3507; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg:
3508; GFX11-CU:       ; %bb.0: ; %entry
3509; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3510; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3511; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3512; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3513; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3514; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3515; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3516; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3517; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3518; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3519; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3520; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3521; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3522; GFX11-CU-NEXT:    s_endpgm
3523;
3524; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
3525; GFX12-WGP:       ; %bb.0: ; %entry
3526; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3527; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3528; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3529; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3530; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3531; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3532; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3533; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3534; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3535; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3536; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
3537; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
3538; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
3539; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
3540; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
3541; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3542; GFX12-WGP-NEXT:    s_endpgm
3543;
3544; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg:
3545; GFX12-CU:       ; %bb.0: ; %entry
3546; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3547; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3548; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3549; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3550; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3551; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3552; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3553; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3554; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3555; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3556; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
3557; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
3558; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
3559; GFX12-CU-NEXT:    s_wait_storecnt 0x0
3560; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
3561; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3562; GFX12-CU-NEXT:    s_endpgm
3563    ptr %out, i32 %in, i32 %old) {
3564entry:
3565  %gep = getelementptr i32, ptr %out, i32 4
3566  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic
3567  ret void
3568}
3569
; Test kernel: a volatile cmpxchg through a generic pointer, with acq_rel
; ordering on success and monotonic ordering on failure. No syncscope is
; written on the instruction. The operation targets the i32 element at
; index 4 from %out and its result is not used.
; The prefixed comment lines between the opening parenthesis and the
; parameter list are the expected llc output, one group per check prefix
; named in the RUN lines at the top of the file. The file header states
; they are autogenerated by utils/update_llc_test_checks.py, so they should
; be regenerated with that script rather than edited by hand.
3570define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
3571; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3572; GFX7:       ; %bb.0: ; %entry
3573; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3574; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3575; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3576; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3577; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3578; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3579; GFX7-NEXT:    s_mov_b32 s4, s8
3580; GFX7-NEXT:    s_mov_b32 s5, s9
3581; GFX7-NEXT:    s_mov_b32 s9, s10
3582; GFX7-NEXT:    s_mov_b32 s8, s11
3583; GFX7-NEXT:    s_add_u32 s4, s4, s9
3584; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3585; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3586; GFX7-NEXT:    s_mov_b32 s5, s8
3587; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3588; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3589; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3590; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3591; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3592; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3593; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3594; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3595; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3596; GFX7-NEXT:    buffer_wbinvl1_vol
3597; GFX7-NEXT:    s_endpgm
3598;
3599; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3600; GFX10-WGP:       ; %bb.0: ; %entry
3601; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3602; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3603; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3604; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3605; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3606; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3607; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3608; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3609; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3610; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3611; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3612; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3613; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3614; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3615; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3616; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3617; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3618; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3619; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3620; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3621; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3622; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3623; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3624; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3625; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3626; GFX10-WGP-NEXT:    buffer_gl1_inv
3627; GFX10-WGP-NEXT:    buffer_gl0_inv
3628; GFX10-WGP-NEXT:    s_endpgm
3629;
3630; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3631; GFX10-CU:       ; %bb.0: ; %entry
3632; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3633; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3634; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3635; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3636; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3637; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3638; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3639; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3640; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3641; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3642; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3643; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3644; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3645; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3646; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3647; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3648; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3649; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3650; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3651; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3652; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3653; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3654; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3655; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3656; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3657; GFX10-CU-NEXT:    buffer_gl1_inv
3658; GFX10-CU-NEXT:    buffer_gl0_inv
3659; GFX10-CU-NEXT:    s_endpgm
3660;
3661; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3662; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3663; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3664; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3665; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3666; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3667; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3668; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3669; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3670; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3671; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3672; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3673; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3674; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3675; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3676; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3677; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3678; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3679; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3680; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3681; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3683; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3684; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3685; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3686; SKIP-CACHE-INV-NEXT:    s_endpgm
3687;
3688; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3689; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3690; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3691; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3692; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3693; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3694; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3695; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3696; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3697; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3698; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3699; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3700; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3701; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3702; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3703; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3704; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3705; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3706;
3707; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3708; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3709; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3710; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3711; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3712; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3713; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3714; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3715; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3716; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3717; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3718; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
3719; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3720; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3721; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3722; GFX90A-TGSPLIT-NEXT:    buffer_invl2
3723; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
3724; GFX90A-TGSPLIT-NEXT:    s_endpgm
3725;
3726; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3727; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3728; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3729; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3730; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3731; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3732; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3733; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3734; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3735; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3736; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3737; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
3738; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3739; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
3740; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3741; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
3742; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3743;
3744; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3745; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3746; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3747; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3748; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3749; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3750; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3751; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3752; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3753; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3754; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3755; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
3756; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3757; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
3758; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
3759; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
3760; GFX940-TGSPLIT-NEXT:    s_endpgm
3761;
3762; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3763; GFX11-WGP:       ; %bb.0: ; %entry
3764; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3765; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3766; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3767; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3768; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3769; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3770; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3771; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3772; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3773; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3774; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3775; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3776; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3777; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3778; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3779; GFX11-WGP-NEXT:    buffer_gl1_inv
3780; GFX11-WGP-NEXT:    buffer_gl0_inv
3781; GFX11-WGP-NEXT:    s_endpgm
3782;
3783; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3784; GFX11-CU:       ; %bb.0: ; %entry
3785; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3786; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3787; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3788; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3789; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3790; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3791; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3792; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3793; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3794; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3795; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3796; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3797; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3798; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3799; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3800; GFX11-CU-NEXT:    buffer_gl1_inv
3801; GFX11-CU-NEXT:    buffer_gl0_inv
3802; GFX11-CU-NEXT:    s_endpgm
3803;
3804; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3805; GFX12-WGP:       ; %bb.0: ; %entry
3806; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3807; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3808; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3809; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3810; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3811; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3812; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3813; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3814; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3815; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3816; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
3817; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
3818; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
3819; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
3820; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
3821; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3822; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
3823; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
3824; GFX12-WGP-NEXT:    s_endpgm
3825;
3826; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
3827; GFX12-CU:       ; %bb.0: ; %entry
3828; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3829; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3830; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3831; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3832; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3833; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3834; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3835; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3836; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3837; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3838; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
3839; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
3840; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
3841; GFX12-CU-NEXT:    s_wait_storecnt 0x0
3842; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
3843; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
3844; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
3845; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
3846; GFX12-CU-NEXT:    s_endpgm
3847    ptr %out, i32 %in, i32 %old) {
3848entry:
  ; Address of the i32 element at index 4 from %out. The expected output
  ; above shows this as a 16-byte displacement, either as "offset:16" on the
  ; atomic or as a separate 64-bit add of the constant 16.
3849  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the value at %gep with %old and store %in on a match; %val is
  ; never read afterwards.
3850  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic
3851  ret void
3852}
3853
3854define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
3855; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
3856; GFX7:       ; %bb.0: ; %entry
3857; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3858; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3859; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3860; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3861; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3862; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3863; GFX7-NEXT:    s_mov_b32 s4, s8
3864; GFX7-NEXT:    s_mov_b32 s5, s9
3865; GFX7-NEXT:    s_mov_b32 s9, s10
3866; GFX7-NEXT:    s_mov_b32 s8, s11
3867; GFX7-NEXT:    s_add_u32 s4, s4, s9
3868; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3869; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3870; GFX7-NEXT:    s_mov_b32 s5, s8
3871; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3872; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3873; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3874; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3875; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3876; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3877; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3878; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3879; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3880; GFX7-NEXT:    buffer_wbinvl1_vol
3881; GFX7-NEXT:    s_endpgm
3882;
3883; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
3884; GFX10-WGP:       ; %bb.0: ; %entry
3885; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3886; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3887; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3888; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3889; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3890; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3891; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3892; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3893; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3894; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3895; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3896; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3897; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3898; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3899; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3900; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3901; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3902; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3903; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3904; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3905; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3906; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3907; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3908; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3909; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3910; GFX10-WGP-NEXT:    buffer_gl1_inv
3911; GFX10-WGP-NEXT:    buffer_gl0_inv
3912; GFX10-WGP-NEXT:    s_endpgm
3913;
3914; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
3915; GFX10-CU:       ; %bb.0: ; %entry
3916; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3917; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3918; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3919; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3920; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3921; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3922; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3923; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3924; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3925; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3926; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3927; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3928; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3929; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3930; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3931; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3932; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3933; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3934; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3935; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3936; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3937; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3938; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3939; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3940; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3941; GFX10-CU-NEXT:    buffer_gl1_inv
3942; GFX10-CU-NEXT:    buffer_gl0_inv
3943; GFX10-CU-NEXT:    s_endpgm
3944;
3945; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
3946; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3947; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3948; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3949; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3950; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3951; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3952; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3953; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3954; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3955; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3956; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3957; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3958; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3959; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3960; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3961; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3962; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3963; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3967; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3968; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3969; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3970; SKIP-CACHE-INV-NEXT:    s_endpgm
3971;
3972; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
3973; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3974; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3975; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3976; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3977; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3978; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3979; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3980; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3981; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3982; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3983; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
3984; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3985; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3986; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3987; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
3988; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
3989; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3990;
3991; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
3992; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3993; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3994; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3995; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3996; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3997; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3998; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3999; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4000; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4001; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4002; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4003; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4004; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4005; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4006; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4007; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4008; GFX90A-TGSPLIT-NEXT:    s_endpgm
4009;
4010; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
4011; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4012; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4013; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4014; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4015; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4016; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4017; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4018; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4019; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4020; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4021; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
4022; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4023; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4024; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4025; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
4026; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4027;
4028; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
4029; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4030; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4031; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4032; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4033; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4034; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4035; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4036; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4037; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4038; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4039; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
4040; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4041; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4042; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4043; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
4044; GFX940-TGSPLIT-NEXT:    s_endpgm
4045;
4046; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
4047; GFX11-WGP:       ; %bb.0: ; %entry
4048; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4049; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4050; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4051; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4052; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4053; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4054; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4055; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4056; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4057; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4058; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4059; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4060; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4061; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4062; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4063; GFX11-WGP-NEXT:    buffer_gl1_inv
4064; GFX11-WGP-NEXT:    buffer_gl0_inv
4065; GFX11-WGP-NEXT:    s_endpgm
4066;
4067; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
4068; GFX11-CU:       ; %bb.0: ; %entry
4069; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4070; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4071; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4072; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4073; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4074; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4075; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4076; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4077; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4078; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4079; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4080; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4081; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4082; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4083; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4084; GFX11-CU-NEXT:    buffer_gl1_inv
4085; GFX11-CU-NEXT:    buffer_gl0_inv
4086; GFX11-CU-NEXT:    s_endpgm
4087;
4088; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
4089; GFX12-WGP:       ; %bb.0: ; %entry
4090; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4091; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4092; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4093; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4094; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4095; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4096; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4097; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4098; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4099; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4100; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
4101; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
4102; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
4103; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
4104; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
4105; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4106; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4107; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
4108; GFX12-WGP-NEXT:    s_endpgm
4109;
4110; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
4111; GFX12-CU:       ; %bb.0: ; %entry
4112; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4113; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4114; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4115; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4116; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4117; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4118; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4119; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4120; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4121; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4122; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
4123; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
4124; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
4125; GFX12-CU-NEXT:    s_wait_storecnt 0x0
4126; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
4127; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4128; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
4129; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
4130; GFX12-CU-NEXT:    s_endpgm
4131    ptr %out, i32 %in, i32 %old) {
4132entry:
4133  %gep = getelementptr i32, ptr %out, i32 4
4134  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic
4135  ret void
4136}
4137
; Kernel under test: a volatile cmpxchg through a plain `ptr` argument (the
; checks below expect it to be selected as flat_atomic_cmpswap) with success
; ordering monotonic and failure ordering acquire; no syncscope is specified.
; The access targets element 4 of %out viewed as i32, i.e. byte offset 16: the
; gfx700 and gfx1010 runs expect the constant 16 to be added with
; s_add_u32/s_addc_u32, the remaining runs expect it folded into "offset:16".
; Every run expects the atomic to be followed by a wait and then a cache
; invalidate, except the -amdgcn-skip-cache-invalidations run, which expects
; only the wait before s_endpgm.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4138define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
4139; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
4140; GFX7:       ; %bb.0: ; %entry
4141; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4142; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4143; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4144; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4145; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4146; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4147; GFX7-NEXT:    s_mov_b32 s4, s8
4148; GFX7-NEXT:    s_mov_b32 s5, s9
4149; GFX7-NEXT:    s_mov_b32 s9, s10
4150; GFX7-NEXT:    s_mov_b32 s8, s11
4151; GFX7-NEXT:    s_add_u32 s4, s4, s9
4152; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4153; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4154; GFX7-NEXT:    s_mov_b32 s5, s8
4155; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4156; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4157; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4158; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4159; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4160; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4161; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4162; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4163; GFX7-NEXT:    buffer_wbinvl1_vol
4164; GFX7-NEXT:    s_endpgm
4165;
4166; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
4167; GFX10-WGP:       ; %bb.0: ; %entry
4168; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4169; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4170; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4171; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4172; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4173; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4174; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4175; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4176; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4177; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4178; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4179; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4180; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4181; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4182; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4183; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4184; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4185; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4186; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4187; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4188; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4189; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4190; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4191; GFX10-WGP-NEXT:    buffer_gl1_inv
4192; GFX10-WGP-NEXT:    buffer_gl0_inv
4193; GFX10-WGP-NEXT:    s_endpgm
4194;
4195; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
4196; GFX10-CU:       ; %bb.0: ; %entry
4197; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4198; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4199; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4200; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4201; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4202; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4203; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4204; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4205; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4206; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4207; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4208; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4209; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4210; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4211; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4212; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4213; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4214; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4215; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4216; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4217; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4218; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4219; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4220; GFX10-CU-NEXT:    buffer_gl1_inv
4221; GFX10-CU-NEXT:    buffer_gl0_inv
4222; GFX10-CU-NEXT:    s_endpgm
4223;
4224; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg:
4225; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4226; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4227; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4228; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4229; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4230; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4231; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4232; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4233; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4234; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4235; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4236; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4237; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4238; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4239; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4240; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4241; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4242; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4243; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4246; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4247; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4248; SKIP-CACHE-INV-NEXT:    s_endpgm
4249;
4250; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
4251; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4252; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4253; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4254; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4255; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4256; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4257; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4258; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4259; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4260; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4261; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4262; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4263; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4264; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4265; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4266;
4267; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
4268; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4269; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4270; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4271; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4272; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4273; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4274; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4275; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4276; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4277; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4278; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4279; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4280; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4281; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4282; GFX90A-TGSPLIT-NEXT:    s_endpgm
4283;
4284; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
4285; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4286; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4287; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4288; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4289; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4290; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4291; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4292; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4293; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4294; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4295; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4296; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4297; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
4298; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4299;
4300; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
4301; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4302; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4303; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4304; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4305; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4306; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4307; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4308; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4309; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4310; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4311; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4312; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4313; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
4314; GFX940-TGSPLIT-NEXT:    s_endpgm
4315;
4316; GFX11-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
4317; GFX11-WGP:       ; %bb.0: ; %entry
4318; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4319; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4320; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4321; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4322; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4323; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4324; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4325; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4326; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4327; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4328; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4329; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4330; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4331; GFX11-WGP-NEXT:    buffer_gl1_inv
4332; GFX11-WGP-NEXT:    buffer_gl0_inv
4333; GFX11-WGP-NEXT:    s_endpgm
4334;
4335; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
4336; GFX11-CU:       ; %bb.0: ; %entry
4337; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4338; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4339; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4340; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4341; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4342; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4343; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4344; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4345; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4346; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4347; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4348; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4349; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4350; GFX11-CU-NEXT:    buffer_gl1_inv
4351; GFX11-CU-NEXT:    buffer_gl0_inv
4352; GFX11-CU-NEXT:    s_endpgm
4353;
4354; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
4355; GFX12-WGP:       ; %bb.0: ; %entry
4356; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4357; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4358; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4359; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4360; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4361; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4362; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4363; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4364; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4365; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4366; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4367; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4368; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
4369; GFX12-WGP-NEXT:    s_endpgm
4370;
4371; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
4372; GFX12-CU:       ; %bb.0: ; %entry
4373; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4374; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4375; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4376; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4377; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4378; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4379; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4380; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4381; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4382; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4383; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4384; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
4385; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
4386; GFX12-CU-NEXT:    s_endpgm
4387    ptr %out, i32 %in, i32 %old) {
4388entry:
  ; Address of i32 element 4 of %out (16 bytes past the base pointer).
4389  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the value compared against memory and %in is the replacement;
  ; the result %val is never used.
4390  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire
4391  ret void
4392}
4393
; Kernel under test: a volatile cmpxchg through a plain `ptr` argument (the
; checks below expect it to be selected as flat_atomic_cmpswap) with acquire
; as both the success and the failure ordering; no syncscope is specified.
; The access targets element 4 of %out viewed as i32, i.e. byte offset 16: the
; gfx700 and gfx1010 runs expect the constant 16 to be added with
; s_add_u32/s_addc_u32, the remaining runs expect it folded into "offset:16".
; Every run expects the atomic to be followed by a wait and then a cache
; invalidate, except the -amdgcn-skip-cache-invalidations run, which expects
; only the wait before s_endpgm.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4394define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
4395; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
4396; GFX7:       ; %bb.0: ; %entry
4397; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4398; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4399; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4400; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4401; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4402; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4403; GFX7-NEXT:    s_mov_b32 s4, s8
4404; GFX7-NEXT:    s_mov_b32 s5, s9
4405; GFX7-NEXT:    s_mov_b32 s9, s10
4406; GFX7-NEXT:    s_mov_b32 s8, s11
4407; GFX7-NEXT:    s_add_u32 s4, s4, s9
4408; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4409; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4410; GFX7-NEXT:    s_mov_b32 s5, s8
4411; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4412; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4413; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4414; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4415; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4416; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4417; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4418; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4419; GFX7-NEXT:    buffer_wbinvl1_vol
4420; GFX7-NEXT:    s_endpgm
4421;
4422; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
4423; GFX10-WGP:       ; %bb.0: ; %entry
4424; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4425; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4426; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4427; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4428; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4429; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4430; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4431; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4432; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4433; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4434; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4435; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4436; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4437; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4438; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4439; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4440; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4441; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4442; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4443; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4444; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4445; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4446; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4447; GFX10-WGP-NEXT:    buffer_gl1_inv
4448; GFX10-WGP-NEXT:    buffer_gl0_inv
4449; GFX10-WGP-NEXT:    s_endpgm
4450;
4451; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
4452; GFX10-CU:       ; %bb.0: ; %entry
4453; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4454; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4455; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4456; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4457; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4458; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4459; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4460; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4461; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4462; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4463; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4464; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4465; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4466; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4467; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4468; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4469; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4470; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4471; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4472; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4473; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4474; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4475; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4476; GFX10-CU-NEXT:    buffer_gl1_inv
4477; GFX10-CU-NEXT:    buffer_gl0_inv
4478; GFX10-CU-NEXT:    s_endpgm
4479;
4480; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg:
4481; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4482; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4483; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4484; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4485; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4486; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4487; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4488; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4489; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4490; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4491; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4492; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4493; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4494; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4495; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4497; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4498; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4499; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4500; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4501; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4502; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4503; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4504; SKIP-CACHE-INV-NEXT:    s_endpgm
4505;
4506; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
4507; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4508; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4509; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4510; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4511; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4512; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4513; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4514; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4515; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4516; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4517; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4518; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4519; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4520; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4521; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4522;
4523; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
4524; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4525; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4526; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4527; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4528; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4529; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4530; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4531; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4532; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4533; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4534; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4535; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4536; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4537; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4538; GFX90A-TGSPLIT-NEXT:    s_endpgm
4539;
4540; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
4541; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4542; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4543; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4544; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4545; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4546; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4547; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4548; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4549; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4550; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4551; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4552; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4553; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
4554; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4555;
4556; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
4557; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4558; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4559; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4560; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4561; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4562; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4563; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4564; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4565; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4566; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4567; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4568; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4569; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
4570; GFX940-TGSPLIT-NEXT:    s_endpgm
4571;
4572; GFX11-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
4573; GFX11-WGP:       ; %bb.0: ; %entry
4574; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4575; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4576; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4577; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4578; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4579; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4580; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4581; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4582; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4583; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4584; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4585; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4586; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4587; GFX11-WGP-NEXT:    buffer_gl1_inv
4588; GFX11-WGP-NEXT:    buffer_gl0_inv
4589; GFX11-WGP-NEXT:    s_endpgm
4590;
4591; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
4592; GFX11-CU:       ; %bb.0: ; %entry
4593; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4594; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4595; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4596; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4597; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4598; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4599; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4600; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4601; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4602; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4603; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4604; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4605; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4606; GFX11-CU-NEXT:    buffer_gl1_inv
4607; GFX11-CU-NEXT:    buffer_gl0_inv
4608; GFX11-CU-NEXT:    s_endpgm
4609;
4610; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
4611; GFX12-WGP:       ; %bb.0: ; %entry
4612; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4613; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4614; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4615; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4616; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4617; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4618; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4619; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4620; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4621; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4622; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4623; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4624; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
4625; GFX12-WGP-NEXT:    s_endpgm
4626;
4627; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
4628; GFX12-CU:       ; %bb.0: ; %entry
4629; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4630; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4631; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4632; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4633; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4634; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4635; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4636; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4637; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4638; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4639; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4640; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
4641; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
4642; GFX12-CU-NEXT:    s_endpgm
4643    ptr %out, i32 %in, i32 %old) {
4644entry:
  ; Address of i32 element 4 of %out (16 bytes past the base pointer).
4645  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the value compared against memory and %in is the replacement;
  ; the result %val is never used.
4646  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire
4647  ret void
4648}
4649
; Test kernel: a volatile 32-bit cmpxchg through a plain `ptr` argument with
; `release` success ordering and `acquire` failure ordering. The expected
; code for every RUN configuration is listed between the `define` line and
; the parameter list; each configuration expects a flat_atomic_cmpswap.
;
; Shape of the expectations below, per configuration:
;   - before the atomic: a wait on outstanding memory operations, preceded
;     by a writeback instruction (buffer_wbl2 / global_wb) on the GFX90A,
;     GFX940 and GFX12 runs;
;   - after the atomic: another wait followed by cache-invalidate
;     instructions, except in the SKIP-CACHE-INV run, which ends right
;     after the wait.
;
; The assertion lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
4650define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
4651; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
4652; GFX7:       ; %bb.0: ; %entry
4653; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4654; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4655; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4656; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4657; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4658; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4659; GFX7-NEXT:    s_mov_b32 s4, s8
4660; GFX7-NEXT:    s_mov_b32 s5, s9
4661; GFX7-NEXT:    s_mov_b32 s9, s10
4662; GFX7-NEXT:    s_mov_b32 s8, s11
4663; GFX7-NEXT:    s_add_u32 s4, s4, s9
4664; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4665; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4666; GFX7-NEXT:    s_mov_b32 s5, s8
4667; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4668; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4669; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4670; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4671; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4672; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4673; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4674; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4675; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4676; GFX7-NEXT:    buffer_wbinvl1_vol
4677; GFX7-NEXT:    s_endpgm
4678;
4679; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
4680; GFX10-WGP:       ; %bb.0: ; %entry
4681; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4682; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4683; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4684; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4685; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4686; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4687; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4688; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4689; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4690; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4691; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4692; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4693; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4694; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4695; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4696; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4697; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4698; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4699; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4700; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4701; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4702; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4703; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4704; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4705; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4706; GFX10-WGP-NEXT:    buffer_gl1_inv
4707; GFX10-WGP-NEXT:    buffer_gl0_inv
4708; GFX10-WGP-NEXT:    s_endpgm
4709;
4710; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
4711; GFX10-CU:       ; %bb.0: ; %entry
4712; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4713; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4714; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4715; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4716; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4717; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4718; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4719; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4720; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4721; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4722; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4723; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4724; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4725; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4726; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4727; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4728; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4729; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4730; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4731; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4732; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4733; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4734; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4735; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4736; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4737; GFX10-CU-NEXT:    buffer_gl1_inv
4738; GFX10-CU-NEXT:    buffer_gl0_inv
4739; GFX10-CU-NEXT:    s_endpgm
4740;
4741; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg:
4742; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4743; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4744; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4745; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4746; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4747; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4748; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4749; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4750; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4751; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4752; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4753; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4754; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4755; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4756; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4757; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4758; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4759; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4760; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4761; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4763; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4764; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4765; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4766; SKIP-CACHE-INV-NEXT:    s_endpgm
4767;
4768; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
4769; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4770; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4771; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4772; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4773; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4774; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4775; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4776; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4777; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4778; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4779; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
4780; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4781; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4782; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4783; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
4784; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
4785; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4786;
4787; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
4788; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4789; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4790; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4791; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4792; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4793; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4794; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4795; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4796; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4797; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4798; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
4799; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4800; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4801; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4802; GFX90A-TGSPLIT-NEXT:    buffer_invl2
4803; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
4804; GFX90A-TGSPLIT-NEXT:    s_endpgm
4805;
4806; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
4807; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4808; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4809; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4810; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4811; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4812; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4813; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4814; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4815; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4816; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4817; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
4818; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4819; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4820; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4821; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
4822; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4823;
4824; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
4825; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4826; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4827; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4828; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4829; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4830; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4831; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4832; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4833; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4834; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4835; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
4836; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4837; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
4838; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
4839; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
4840; GFX940-TGSPLIT-NEXT:    s_endpgm
4841;
4842; GFX11-WGP-LABEL: flat_system_release_acquire_cmpxchg:
4843; GFX11-WGP:       ; %bb.0: ; %entry
4844; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4845; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4846; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4847; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4848; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4849; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4850; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4851; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4852; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4853; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4854; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4855; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4856; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4857; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4858; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4859; GFX11-WGP-NEXT:    buffer_gl1_inv
4860; GFX11-WGP-NEXT:    buffer_gl0_inv
4861; GFX11-WGP-NEXT:    s_endpgm
4862;
4863; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg:
4864; GFX11-CU:       ; %bb.0: ; %entry
4865; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4866; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4867; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4868; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4869; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4870; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4871; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4872; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4873; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4874; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4875; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4876; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4877; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4878; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4879; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4880; GFX11-CU-NEXT:    buffer_gl1_inv
4881; GFX11-CU-NEXT:    buffer_gl0_inv
4882; GFX11-CU-NEXT:    s_endpgm
4883;
4884; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg:
4885; GFX12-WGP:       ; %bb.0: ; %entry
4886; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4887; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4888; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4889; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4890; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4891; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4892; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4893; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4894; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4895; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4896; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
4897; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
4898; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
4899; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
4900; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
4901; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4902; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
4903; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
4904; GFX12-WGP-NEXT:    s_endpgm
4905;
4906; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg:
4907; GFX12-CU:       ; %bb.0: ; %entry
4908; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4909; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4910; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4911; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4912; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4913; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4914; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4915; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4916; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4917; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4918; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
4919; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
4920; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
4921; GFX12-CU-NEXT:    s_wait_storecnt 0x0
4922; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
4923; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
4924; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
4925; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
4926; GFX12-CU-NEXT:    s_endpgm
4927    ptr %out, i32 %in, i32 %old) {
4928entry:
  ; Address of the fifth i32 past %out, i.e. %out + 16 bytes. The checks show
  ; this either as an `offset:16` operand on the atomic or as an explicit
  ; s_add_u32/s_addc_u32 of the constant 16.
4929  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and, on a match, store %in.
  ; Orderings: release on success, acquire on failure. No syncscope is
  ; written, so the default (system) scope applies; the GFX12 checks show
  ; scope:SCOPE_SYS. The result %val is intentionally left unused.
4930  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire
4931  ret void
4932}
4933
; Test kernel: a volatile 32-bit cmpxchg through a plain `ptr` argument with
; `acq_rel` success ordering and `acquire` failure ordering. The expected
; code for every RUN configuration is listed between the `define` line and
; the parameter list; each configuration expects a flat_atomic_cmpswap.
;
; Shape of the expectations below, per configuration:
;   - before the atomic: a wait on outstanding memory operations, preceded
;     by a writeback instruction (buffer_wbl2 / global_wb) on the GFX90A,
;     GFX940 and GFX12 runs;
;   - after the atomic: another wait followed by cache-invalidate
;     instructions, except in the SKIP-CACHE-INV run, which ends right
;     after the wait.
;
; The assertion lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
4934define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
4935; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
4936; GFX7:       ; %bb.0: ; %entry
4937; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4938; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4939; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4940; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4941; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4942; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4943; GFX7-NEXT:    s_mov_b32 s4, s8
4944; GFX7-NEXT:    s_mov_b32 s5, s9
4945; GFX7-NEXT:    s_mov_b32 s9, s10
4946; GFX7-NEXT:    s_mov_b32 s8, s11
4947; GFX7-NEXT:    s_add_u32 s4, s4, s9
4948; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4949; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4950; GFX7-NEXT:    s_mov_b32 s5, s8
4951; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4952; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4953; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4954; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4955; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4956; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4957; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4958; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4959; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4960; GFX7-NEXT:    buffer_wbinvl1_vol
4961; GFX7-NEXT:    s_endpgm
4962;
4963; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
4964; GFX10-WGP:       ; %bb.0: ; %entry
4965; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4966; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4967; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4968; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4969; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4970; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4971; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4972; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4973; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4974; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4975; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4976; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4977; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4978; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4979; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4980; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4981; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4982; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4983; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4984; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4985; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4986; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4987; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4988; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4989; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4990; GFX10-WGP-NEXT:    buffer_gl1_inv
4991; GFX10-WGP-NEXT:    buffer_gl0_inv
4992; GFX10-WGP-NEXT:    s_endpgm
4993;
4994; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
4995; GFX10-CU:       ; %bb.0: ; %entry
4996; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4997; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4998; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4999; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5000; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5001; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5002; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5003; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5004; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5005; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5006; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5007; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5008; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5009; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5010; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5011; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5012; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5013; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5014; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5015; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5016; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5017; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5018; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5019; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5020; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5021; GFX10-CU-NEXT:    buffer_gl1_inv
5022; GFX10-CU-NEXT:    buffer_gl0_inv
5023; GFX10-CU-NEXT:    s_endpgm
5024;
5025; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5026; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5027; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5028; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5029; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5030; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5031; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5032; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5033; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5034; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5035; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5036; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5037; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5038; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5039; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5040; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5041; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5042; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5043; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5044; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5045; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5046; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5047; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5048; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5049; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5050; SKIP-CACHE-INV-NEXT:    s_endpgm
5051;
5052; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5053; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5054; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5055; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5056; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5057; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5058; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5059; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5060; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5061; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5062; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5063; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5064; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5065; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5066; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5067; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5068; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5069; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5070;
5071; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5072; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5073; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5074; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5075; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5076; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5077; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5078; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5079; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5080; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5081; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5082; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5083; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5084; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5085; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5086; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5087; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5088; GFX90A-TGSPLIT-NEXT:    s_endpgm
5089;
5090; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5091; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5092; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5093; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5094; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5095; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5096; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5097; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5098; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5099; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5100; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5101; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5102; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5103; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5104; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5105; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
5106; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5107;
5108; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5109; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5110; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5111; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5112; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5113; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5114; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5115; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5116; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5117; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5118; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5119; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5120; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5121; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5122; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5123; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
5124; GFX940-TGSPLIT-NEXT:    s_endpgm
5125;
5126; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5127; GFX11-WGP:       ; %bb.0: ; %entry
5128; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5129; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5130; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5131; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5132; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5133; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5134; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5135; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5136; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5137; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5138; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5139; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5140; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5141; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5142; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5143; GFX11-WGP-NEXT:    buffer_gl1_inv
5144; GFX11-WGP-NEXT:    buffer_gl0_inv
5145; GFX11-WGP-NEXT:    s_endpgm
5146;
5147; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5148; GFX11-CU:       ; %bb.0: ; %entry
5149; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5150; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5151; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5152; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5153; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5154; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5155; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5156; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5157; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5158; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5159; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5160; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5161; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5162; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5163; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5164; GFX11-CU-NEXT:    buffer_gl1_inv
5165; GFX11-CU-NEXT:    buffer_gl0_inv
5166; GFX11-CU-NEXT:    s_endpgm
5167;
5168; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5169; GFX12-WGP:       ; %bb.0: ; %entry
5170; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5171; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5172; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5173; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5174; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5175; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5176; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5177; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5178; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5179; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5180; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
5181; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
5182; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
5183; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
5184; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5185; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
5186; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
5187; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
5188; GFX12-WGP-NEXT:    s_endpgm
5189;
5190; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
5191; GFX12-CU:       ; %bb.0: ; %entry
5192; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5193; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5194; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5195; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5196; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5197; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5198; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5199; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5200; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5201; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5202; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
5203; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
5204; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
5205; GFX12-CU-NEXT:    s_wait_storecnt 0x0
5206; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
5207; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
5208; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
5209; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
5210; GFX12-CU-NEXT:    s_endpgm
5211    ptr %out, i32 %in, i32 %old) {
5212entry:
  ; Address of the fifth i32 past %out, i.e. %out + 16 bytes. The checks show
  ; this either as an `offset:16` operand on the atomic or as an explicit
  ; s_add_u32/s_addc_u32 of the constant 16.
5213  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and, on a match, store %in.
  ; Orderings: acq_rel on success, acquire on failure. No syncscope is
  ; written, so the default (system) scope applies; the GFX12 checks show
  ; scope:SCOPE_SYS. The result %val is intentionally left unused.
5214  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire
5215  ret void
5216}
5217
5218define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
5219; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5220; GFX7:       ; %bb.0: ; %entry
5221; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5222; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5223; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5224; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5225; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5226; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5227; GFX7-NEXT:    s_mov_b32 s4, s8
5228; GFX7-NEXT:    s_mov_b32 s5, s9
5229; GFX7-NEXT:    s_mov_b32 s9, s10
5230; GFX7-NEXT:    s_mov_b32 s8, s11
5231; GFX7-NEXT:    s_add_u32 s4, s4, s9
5232; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5233; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5234; GFX7-NEXT:    s_mov_b32 s5, s8
5235; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5236; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5237; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5238; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5239; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5240; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5241; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5242; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5243; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5244; GFX7-NEXT:    buffer_wbinvl1_vol
5245; GFX7-NEXT:    s_endpgm
5246;
5247; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5248; GFX10-WGP:       ; %bb.0: ; %entry
5249; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5250; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5251; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5252; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5253; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5254; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5255; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5256; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5257; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5258; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5259; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5260; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5261; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5262; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5263; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5264; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5265; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5266; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5267; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5268; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5269; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5270; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5271; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5272; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5273; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5274; GFX10-WGP-NEXT:    buffer_gl1_inv
5275; GFX10-WGP-NEXT:    buffer_gl0_inv
5276; GFX10-WGP-NEXT:    s_endpgm
5277;
5278; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5279; GFX10-CU:       ; %bb.0: ; %entry
5280; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5281; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5282; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5283; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5284; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5285; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5286; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5287; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5288; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5289; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5290; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5291; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5292; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5293; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5294; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5296; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5297; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5298; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5299; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5300; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5301; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5302; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5303; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5304; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5305; GFX10-CU-NEXT:    buffer_gl1_inv
5306; GFX10-CU-NEXT:    buffer_gl0_inv
5307; GFX10-CU-NEXT:    s_endpgm
5308;
5309; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5310; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5311; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5312; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5313; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5314; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5315; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5316; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5317; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5318; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5319; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5320; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5321; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5322; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5323; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5324; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5326; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5327; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5328; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5329; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5331; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5332; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5333; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5334; SKIP-CACHE-INV-NEXT:    s_endpgm
5335;
5336; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5337; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5340; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5341; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5342; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5343; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5344; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5345; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5346; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5347; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5348; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5349; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5350; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5351; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5352; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5353; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5354;
5355; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5356; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5357; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5358; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5359; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5360; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5361; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5362; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5363; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5364; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5365; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5366; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5367; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5368; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5369; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5370; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5371; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5372; GFX90A-TGSPLIT-NEXT:    s_endpgm
5373;
5374; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5375; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5376; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5377; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5378; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5379; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5380; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5381; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5382; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5383; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5384; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5385; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5386; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5387; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5388; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5389; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
5390; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5391;
5392; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5393; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5394; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5395; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5396; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5397; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5398; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5399; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5400; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5401; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5402; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5403; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5404; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5405; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5406; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5407; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
5408; GFX940-TGSPLIT-NEXT:    s_endpgm
5409;
5410; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5411; GFX11-WGP:       ; %bb.0: ; %entry
5412; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5413; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5414; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5415; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5416; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5417; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5418; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5419; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5420; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5421; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5422; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5423; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5424; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5425; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5426; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5427; GFX11-WGP-NEXT:    buffer_gl1_inv
5428; GFX11-WGP-NEXT:    buffer_gl0_inv
5429; GFX11-WGP-NEXT:    s_endpgm
5430;
5431; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5432; GFX11-CU:       ; %bb.0: ; %entry
5433; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5434; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5435; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5436; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5437; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5438; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5439; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5440; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5441; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5442; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5443; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5444; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5445; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5446; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5447; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5448; GFX11-CU-NEXT:    buffer_gl1_inv
5449; GFX11-CU-NEXT:    buffer_gl0_inv
5450; GFX11-CU-NEXT:    s_endpgm
5451;
5452; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5453; GFX12-WGP:       ; %bb.0: ; %entry
5454; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5455; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5456; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5457; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5458; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5459; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5460; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5461; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5462; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5463; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5464; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
5465; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
5466; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
5467; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
5468; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5469; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
5470; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
5471; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
5472; GFX12-WGP-NEXT:    s_endpgm
5473;
5474; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
5475; GFX12-CU:       ; %bb.0: ; %entry
5476; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5477; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5478; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5479; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5480; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5481; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5482; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5483; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5484; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5485; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5486; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
5487; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
5488; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
5489; GFX12-CU-NEXT:    s_wait_storecnt 0x0
5490; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
5491; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
5492; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
5493; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
5494; GFX12-CU-NEXT:    s_endpgm
5495    ptr %out, i32 %in, i32 %old) {
5496entry:
5497  %gep = getelementptr i32, ptr %out, i32 4
5498  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire
5499  ret void
5500}
5501
; Kernel under test: a volatile cmpxchg through a plain `ptr` with no
; syncscope (LLVM's default, system-wide scope), success ordering monotonic
; and failure ordering seq_cst. The autogenerated assertions below pin the
; instruction sequence expected from each llc invocation listed at the top of
; this file; regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5502define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
5503; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5504; GFX7:       ; %bb.0: ; %entry
5505; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5506; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5507; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5508; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5509; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5510; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5511; GFX7-NEXT:    s_mov_b32 s4, s8
5512; GFX7-NEXT:    s_mov_b32 s5, s9
5513; GFX7-NEXT:    s_mov_b32 s9, s10
5514; GFX7-NEXT:    s_mov_b32 s8, s11
5515; GFX7-NEXT:    s_add_u32 s4, s4, s9
5516; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5517; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5518; GFX7-NEXT:    s_mov_b32 s5, s8
5519; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5520; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5521; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5522; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5523; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5524; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5525; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5526; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5527; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5528; GFX7-NEXT:    buffer_wbinvl1_vol
5529; GFX7-NEXT:    s_endpgm
5530;
5531; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5532; GFX10-WGP:       ; %bb.0: ; %entry
5533; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5534; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5535; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5536; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5537; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5538; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5539; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5540; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5541; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5542; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5543; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5544; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5545; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5546; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5547; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5548; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5549; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5550; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5551; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5552; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5553; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5554; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5555; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5556; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5557; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5558; GFX10-WGP-NEXT:    buffer_gl1_inv
5559; GFX10-WGP-NEXT:    buffer_gl0_inv
5560; GFX10-WGP-NEXT:    s_endpgm
5561;
5562; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5563; GFX10-CU:       ; %bb.0: ; %entry
5564; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5565; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5566; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5567; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5568; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5569; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5570; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5571; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5572; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5573; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5574; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5575; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5576; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5577; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5578; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5579; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5580; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5581; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5582; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5583; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5584; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5585; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5586; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5587; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5588; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5589; GFX10-CU-NEXT:    buffer_gl1_inv
5590; GFX10-CU-NEXT:    buffer_gl0_inv
5591; GFX10-CU-NEXT:    s_endpgm
5592;
5593; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5594; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5595; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5596; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5597; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5598; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5599; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5600; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5601; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5602; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5603; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5604; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5605; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5606; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5607; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5608; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5609; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5610; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5611; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5612; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5613; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5614; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5615; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5616; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5617; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5618; SKIP-CACHE-INV-NEXT:    s_endpgm
5619;
5620; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5621; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5622; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5623; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5624; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5625; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5626; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5627; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5628; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5629; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5630; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5631; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5632; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5633; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5634; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5635; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5636; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5637; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5638;
5639; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5640; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5641; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5642; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5643; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5644; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5645; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5646; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5647; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5648; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5649; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5650; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5651; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5652; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5653; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5654; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5655; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5656; GFX90A-TGSPLIT-NEXT:    s_endpgm
5657;
5658; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5659; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5660; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5661; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5662; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5663; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5664; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5665; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5666; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5667; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5668; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5669; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5670; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5671; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5672; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5673; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
5674; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5675;
5676; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5677; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5678; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5679; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5680; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5681; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5682; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5683; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5684; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5685; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5686; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5687; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5688; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5689; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5690; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5691; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
5692; GFX940-TGSPLIT-NEXT:    s_endpgm
5693;
5694; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5695; GFX11-WGP:       ; %bb.0: ; %entry
5696; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5697; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5698; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5699; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5700; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5701; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5702; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5703; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5704; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5705; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5706; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5707; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5708; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5709; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5710; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5711; GFX11-WGP-NEXT:    buffer_gl1_inv
5712; GFX11-WGP-NEXT:    buffer_gl0_inv
5713; GFX11-WGP-NEXT:    s_endpgm
5714;
5715; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5716; GFX11-CU:       ; %bb.0: ; %entry
5717; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5718; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5719; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5720; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5721; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5722; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5723; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5724; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5725; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5726; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5727; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5728; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5729; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5730; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5731; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5732; GFX11-CU-NEXT:    buffer_gl1_inv
5733; GFX11-CU-NEXT:    buffer_gl0_inv
5734; GFX11-CU-NEXT:    s_endpgm
5735;
5736; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5737; GFX12-WGP:       ; %bb.0: ; %entry
5738; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5739; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5740; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5741; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5742; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5743; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5744; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5745; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5746; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5747; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5748; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
5749; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
5750; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
5751; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
5752; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
5753; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
5754; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
5755; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
5756; GFX12-WGP-NEXT:    s_endpgm
5757;
5758; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
5759; GFX12-CU:       ; %bb.0: ; %entry
5760; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5761; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5762; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5763; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5764; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5765; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5766; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5767; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5768; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5769; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5770; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
5771; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
5772; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
5773; GFX12-CU-NEXT:    s_wait_storecnt 0x0
5774; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
5775; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
5776; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
5777; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
5778; GFX12-CU-NEXT:    s_endpgm
5779    ptr %out, i32 %in, i32 %old) {
5780entry:
  ; Element 4 of an i32 array is a 16-byte displacement. In the expected output
  ; above it shows up either as an immediate offset of 16 on the atomic or as
  ; an explicit 64-bit add of 16 to the base pointer.
5781  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match. %val is never read, and the
  ; checked flat_atomic_cmpswap forms carry no destination register.
5782  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst
5783  ret void
5784}
5785
5786define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
5787; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5788; GFX7:       ; %bb.0: ; %entry
5789; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5790; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5791; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5792; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5793; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5794; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5795; GFX7-NEXT:    s_mov_b32 s4, s8
5796; GFX7-NEXT:    s_mov_b32 s5, s9
5797; GFX7-NEXT:    s_mov_b32 s9, s10
5798; GFX7-NEXT:    s_mov_b32 s8, s11
5799; GFX7-NEXT:    s_add_u32 s4, s4, s9
5800; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5801; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5802; GFX7-NEXT:    s_mov_b32 s5, s8
5803; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5804; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5805; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5806; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5807; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5808; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5809; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5810; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5811; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5812; GFX7-NEXT:    buffer_wbinvl1_vol
5813; GFX7-NEXT:    s_endpgm
5814;
5815; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5816; GFX10-WGP:       ; %bb.0: ; %entry
5817; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5818; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5819; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5820; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5821; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5822; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5823; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5824; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5825; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5826; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5827; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5828; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5829; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5830; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5831; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5832; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5833; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5834; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5835; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5836; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5837; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5838; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5839; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5840; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5841; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5842; GFX10-WGP-NEXT:    buffer_gl1_inv
5843; GFX10-WGP-NEXT:    buffer_gl0_inv
5844; GFX10-WGP-NEXT:    s_endpgm
5845;
5846; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5847; GFX10-CU:       ; %bb.0: ; %entry
5848; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5849; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5850; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5851; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5852; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5853; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5854; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5855; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5856; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5857; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5858; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5859; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5860; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5861; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5862; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5863; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5864; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5865; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5866; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5867; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5868; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5869; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5870; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5871; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5872; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5873; GFX10-CU-NEXT:    buffer_gl1_inv
5874; GFX10-CU-NEXT:    buffer_gl0_inv
5875; GFX10-CU-NEXT:    s_endpgm
5876;
5877; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5878; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5879; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5880; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5881; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5882; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5883; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5884; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5885; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5886; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5887; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5888; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5889; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5890; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5891; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5892; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5893; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5894; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5895; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5896; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5897; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5898; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5899; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5900; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5901; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5902; SKIP-CACHE-INV-NEXT:    s_endpgm
5903;
5904; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5905; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5906; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5907; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5908; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5909; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5910; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5911; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5912; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5913; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5914; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5915; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
5916; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5917; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5918; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5919; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
5920; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
5921; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5922;
5923; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5924; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5925; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5926; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5927; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5928; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5929; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5930; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5931; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5932; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5933; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5934; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
5935; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5936; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5937; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5938; GFX90A-TGSPLIT-NEXT:    buffer_invl2
5939; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
5940; GFX90A-TGSPLIT-NEXT:    s_endpgm
5941;
5942; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5943; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5944; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5945; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5946; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5947; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5948; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5949; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5950; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5951; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5952; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5953; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5954; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5955; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5956; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5957; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
5958; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5959;
5960; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5961; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5962; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5963; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5964; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5965; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5966; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5967; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5968; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5969; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5970; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5971; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
5972; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5973; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
5974; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5975; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
5976; GFX940-TGSPLIT-NEXT:    s_endpgm
5977;
5978; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
5979; GFX11-WGP:       ; %bb.0: ; %entry
5980; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5981; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5982; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5983; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5984; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5985; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5986; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5987; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5988; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5989; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5990; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5991; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5992; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5993; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5994; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5995; GFX11-WGP-NEXT:    buffer_gl1_inv
5996; GFX11-WGP-NEXT:    buffer_gl0_inv
5997; GFX11-WGP-NEXT:    s_endpgm
5998;
5999; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
6000; GFX11-CU:       ; %bb.0: ; %entry
6001; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6002; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6003; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6004; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6005; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6006; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6007; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6008; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6009; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6010; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6011; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6012; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6013; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6014; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6015; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6016; GFX11-CU-NEXT:    buffer_gl1_inv
6017; GFX11-CU-NEXT:    buffer_gl0_inv
6018; GFX11-CU-NEXT:    s_endpgm
6019;
6020; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
6021; GFX12-WGP:       ; %bb.0: ; %entry
6022; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6023; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6024; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6025; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6026; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6027; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6028; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6029; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6030; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6031; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6032; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
6033; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6034; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6035; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6036; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6037; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6038; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
6039; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
6040; GFX12-WGP-NEXT:    s_endpgm
6041;
6042; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
6043; GFX12-CU:       ; %bb.0: ; %entry
6044; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6045; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6046; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6047; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6048; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6049; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6050; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6051; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6052; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6053; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6054; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
6055; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
6056; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
6057; GFX12-CU-NEXT:    s_wait_storecnt 0x0
6058; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6059; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6060; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
6061; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
6062; GFX12-CU-NEXT:    s_endpgm
6063    ptr %out, i32 %in, i32 %old) {
6064entry:
6065  %gep = getelementptr i32, ptr %out, i32 4
6066  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst
6067  ret void
6068}
6069
; Code generation test for a volatile cmpxchg through a generic `ptr` with
; success ordering `release` and failure ordering `seq_cst`; the instruction
; carries no syncscope. The per-target check lines interleaved with the
; signature are autogenerated (see the first line of this file), so regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
6070define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
6071; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
6072; GFX7:       ; %bb.0: ; %entry
6073; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
6074; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6075; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
6076; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
6077; GFX7-NEXT:    s_mov_b64 s[10:11], 16
6078; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6079; GFX7-NEXT:    s_mov_b32 s4, s8
6080; GFX7-NEXT:    s_mov_b32 s5, s9
6081; GFX7-NEXT:    s_mov_b32 s9, s10
6082; GFX7-NEXT:    s_mov_b32 s8, s11
6083; GFX7-NEXT:    s_add_u32 s4, s4, s9
6084; GFX7-NEXT:    s_addc_u32 s8, s5, s8
6085; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6086; GFX7-NEXT:    s_mov_b32 s5, s8
6087; GFX7-NEXT:    v_mov_b32_e32 v2, s7
6088; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6089; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6090; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6091; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6092; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6093; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6094; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6095; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6096; GFX7-NEXT:    buffer_wbinvl1_vol
6097; GFX7-NEXT:    s_endpgm
6098;
6099; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
6100; GFX10-WGP:       ; %bb.0: ; %entry
6101; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
6102; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6103; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
6104; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
6105; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
6106; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6107; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
6108; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
6109; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
6110; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
6111; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
6112; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
6113; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6114; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
6115; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
6116; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6117; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6118; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6119; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6120; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6121; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6122; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6123; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6124; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6125; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6126; GFX10-WGP-NEXT:    buffer_gl1_inv
6127; GFX10-WGP-NEXT:    buffer_gl0_inv
6128; GFX10-WGP-NEXT:    s_endpgm
6129;
6130; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
6131; GFX10-CU:       ; %bb.0: ; %entry
6132; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
6133; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6134; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
6135; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
6136; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
6137; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6138; GFX10-CU-NEXT:    s_mov_b32 s4, s8
6139; GFX10-CU-NEXT:    s_mov_b32 s5, s9
6140; GFX10-CU-NEXT:    s_mov_b32 s9, s10
6141; GFX10-CU-NEXT:    s_mov_b32 s8, s11
6142; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
6143; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
6144; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6145; GFX10-CU-NEXT:    s_mov_b32 s5, s8
6146; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
6147; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6148; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6149; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6150; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6151; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6152; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6153; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6154; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6155; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6156; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6157; GFX10-CU-NEXT:    buffer_gl1_inv
6158; GFX10-CU-NEXT:    buffer_gl0_inv
6159; GFX10-CU-NEXT:    s_endpgm
6160;
6161; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg:
6162; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6163; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
6164; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
6165; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
6166; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
6167; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
6168; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6169; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
6170; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
6171; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
6172; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
6173; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
6174; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
6175; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
6176; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
6177; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
6178; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6179; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6180; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6182; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6183; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6184; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6185; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6186; SKIP-CACHE-INV-NEXT:    s_endpgm
6187;
6188; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
6189; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6190; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6191; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6192; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6193; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6194; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6195; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6196; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6197; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6198; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6199; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6200; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6201; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6202; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6203; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6204; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6205; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6206;
6207; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
6208; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6209; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6210; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6211; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6212; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6213; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6214; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6215; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6216; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6217; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6218; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6219; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6220; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6221; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6222; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6223; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6224; GFX90A-TGSPLIT-NEXT:    s_endpgm
6225;
6226; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
6227; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6228; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6229; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6230; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6231; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6232; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6233; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6234; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6235; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6236; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6237; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
6238; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6239; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
6240; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6241; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
6242; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6243;
6244; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
6245; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6246; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6247; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6248; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6249; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6250; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6251; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6252; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6253; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6254; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6255; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
6256; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6257; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
6258; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6259; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
6260; GFX940-TGSPLIT-NEXT:    s_endpgm
6261;
6262; GFX11-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
6263; GFX11-WGP:       ; %bb.0: ; %entry
6264; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6265; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6266; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6267; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6268; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6269; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6270; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6271; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6272; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6273; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6274; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6275; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6276; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6277; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6278; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6279; GFX11-WGP-NEXT:    buffer_gl1_inv
6280; GFX11-WGP-NEXT:    buffer_gl0_inv
6281; GFX11-WGP-NEXT:    s_endpgm
6282;
6283; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
6284; GFX11-CU:       ; %bb.0: ; %entry
6285; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6286; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6287; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6288; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6289; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6290; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6291; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6292; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6293; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6294; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6295; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6296; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6297; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6298; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6299; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6300; GFX11-CU-NEXT:    buffer_gl1_inv
6301; GFX11-CU-NEXT:    buffer_gl0_inv
6302; GFX11-CU-NEXT:    s_endpgm
6303;
6304; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
6305; GFX12-WGP:       ; %bb.0: ; %entry
6306; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6307; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6308; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6309; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6310; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6311; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6312; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6313; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6314; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6315; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6316; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
6317; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6318; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6319; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6320; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6321; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6322; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
6323; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
6324; GFX12-WGP-NEXT:    s_endpgm
6325;
6326; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
6327; GFX12-CU:       ; %bb.0: ; %entry
6328; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6329; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6330; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6331; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6332; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6333; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6334; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6335; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6336; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6337; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6338; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
6339; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
6340; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
6341; GFX12-CU-NEXT:    s_wait_storecnt 0x0
6342; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6343; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6344; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
6345; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
6346; GFX12-CU-NEXT:    s_endpgm
6347    ptr %out, i32 %in, i32 %old) {
6348entry:
  ; Element index 4 of i32 is a byte offset of 16; the checks above show it
  ; either as an explicit 64-bit scalar add of 16 or as `offset:16` on the
  ; atomic instruction.
6349  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare value and %in the replacement. %val is never read,
  ; and every check block expects a flat_atomic_cmpswap* instruction with only
  ; address and data operands.
6350  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst
6351  ret void
6352}
6353
6354define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
6355; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6356; GFX7:       ; %bb.0: ; %entry
6357; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
6358; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6359; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
6360; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
6361; GFX7-NEXT:    s_mov_b64 s[10:11], 16
6362; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6363; GFX7-NEXT:    s_mov_b32 s4, s8
6364; GFX7-NEXT:    s_mov_b32 s5, s9
6365; GFX7-NEXT:    s_mov_b32 s9, s10
6366; GFX7-NEXT:    s_mov_b32 s8, s11
6367; GFX7-NEXT:    s_add_u32 s4, s4, s9
6368; GFX7-NEXT:    s_addc_u32 s8, s5, s8
6369; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6370; GFX7-NEXT:    s_mov_b32 s5, s8
6371; GFX7-NEXT:    v_mov_b32_e32 v2, s7
6372; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6373; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6374; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6375; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6376; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6377; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6378; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6379; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6380; GFX7-NEXT:    buffer_wbinvl1_vol
6381; GFX7-NEXT:    s_endpgm
6382;
6383; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6384; GFX10-WGP:       ; %bb.0: ; %entry
6385; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
6386; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6387; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
6388; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
6389; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
6390; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6391; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
6392; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
6393; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
6394; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
6395; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
6396; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
6397; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6398; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
6399; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
6400; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6401; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6402; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6403; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6404; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6405; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6406; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6407; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6408; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6409; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6410; GFX10-WGP-NEXT:    buffer_gl1_inv
6411; GFX10-WGP-NEXT:    buffer_gl0_inv
6412; GFX10-WGP-NEXT:    s_endpgm
6413;
6414; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6415; GFX10-CU:       ; %bb.0: ; %entry
6416; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
6417; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6418; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
6419; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
6420; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
6421; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6422; GFX10-CU-NEXT:    s_mov_b32 s4, s8
6423; GFX10-CU-NEXT:    s_mov_b32 s5, s9
6424; GFX10-CU-NEXT:    s_mov_b32 s9, s10
6425; GFX10-CU-NEXT:    s_mov_b32 s8, s11
6426; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
6427; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
6428; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6429; GFX10-CU-NEXT:    s_mov_b32 s5, s8
6430; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
6431; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6432; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6433; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6434; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6435; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6436; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6437; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6438; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6439; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6440; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6441; GFX10-CU-NEXT:    buffer_gl1_inv
6442; GFX10-CU-NEXT:    buffer_gl0_inv
6443; GFX10-CU-NEXT:    s_endpgm
6444;
6445; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6446; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6447; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
6448; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
6449; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
6450; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
6451; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
6452; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6453; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
6454; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
6455; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
6456; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
6457; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
6458; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
6459; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
6460; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
6461; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
6462; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6463; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6464; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6465; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6466; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6467; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6468; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6469; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6470; SKIP-CACHE-INV-NEXT:    s_endpgm
6471;
6472; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6473; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6474; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6475; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6476; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6477; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6478; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6479; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6480; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6481; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6482; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6483; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6484; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6485; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6486; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6487; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6488; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6489; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6490;
6491; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6492; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6493; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6494; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6495; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6496; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6497; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6498; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6499; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6500; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6501; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6502; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6503; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6504; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6505; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6506; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6507; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6508; GFX90A-TGSPLIT-NEXT:    s_endpgm
6509;
6510; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6511; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6512; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6513; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6514; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6515; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6516; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6517; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6518; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6519; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6520; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6521; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
6522; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6523; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
6524; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6525; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
6526; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6527;
6528; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6529; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6530; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6531; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6532; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6533; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6534; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6535; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6536; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6537; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6538; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6539; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
6540; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6541; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
6542; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6543; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
6544; GFX940-TGSPLIT-NEXT:    s_endpgm
6545;
6546; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6547; GFX11-WGP:       ; %bb.0: ; %entry
6548; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6549; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6550; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6551; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6552; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6553; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6554; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6555; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6556; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6557; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6558; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6559; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6560; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6561; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6562; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6563; GFX11-WGP-NEXT:    buffer_gl1_inv
6564; GFX11-WGP-NEXT:    buffer_gl0_inv
6565; GFX11-WGP-NEXT:    s_endpgm
6566;
6567; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6568; GFX11-CU:       ; %bb.0: ; %entry
6569; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6570; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6571; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6572; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6573; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6574; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6575; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6576; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6577; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6578; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6579; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6580; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6581; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6582; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6583; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6584; GFX11-CU-NEXT:    buffer_gl1_inv
6585; GFX11-CU-NEXT:    buffer_gl0_inv
6586; GFX11-CU-NEXT:    s_endpgm
6587;
6588; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6589; GFX12-WGP:       ; %bb.0: ; %entry
6590; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6591; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6592; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6593; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6594; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6595; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6596; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6597; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6598; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6599; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6600; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
6601; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6602; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6603; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6604; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6605; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6606; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
6607; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
6608; GFX12-WGP-NEXT:    s_endpgm
6609;
6610; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
6611; GFX12-CU:       ; %bb.0: ; %entry
6612; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6613; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6614; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6615; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6616; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6617; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6618; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6619; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6620; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6621; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6622; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
6623; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
6624; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
6625; GFX12-CU-NEXT:    s_wait_storecnt 0x0
6626; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6627; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6628; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
6629; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
6630; GFX12-CU-NEXT:    s_endpgm
6631    ptr %out, i32 %in, i32 %old) {
6632entry:
6633  %gep = getelementptr i32, ptr %out, i32 4
6634  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst
6635  ret void
6636}
6637
; Non-returning cmpxchg through a plain `ptr` argument with seq_cst success and
; seq_cst failure ordering and no explicit syncscope. The result %val is never
; used, and every expected sequence below uses the cmpswap form without a
; destination register. All runs except the SKIP-CACHE-INV one expect a cache
; invalidate after the atomic.
; The assertion lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file) - regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
6638define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
6639; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6640; GFX7:       ; %bb.0: ; %entry
6641; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
6642; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6643; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
6644; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
6645; GFX7-NEXT:    s_mov_b64 s[10:11], 16
6646; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6647; GFX7-NEXT:    s_mov_b32 s4, s8
6648; GFX7-NEXT:    s_mov_b32 s5, s9
6649; GFX7-NEXT:    s_mov_b32 s9, s10
6650; GFX7-NEXT:    s_mov_b32 s8, s11
6651; GFX7-NEXT:    s_add_u32 s4, s4, s9
6652; GFX7-NEXT:    s_addc_u32 s8, s5, s8
6653; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6654; GFX7-NEXT:    s_mov_b32 s5, s8
6655; GFX7-NEXT:    v_mov_b32_e32 v2, s7
6656; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6657; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6658; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6659; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6660; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6661; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6662; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6663; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6664; GFX7-NEXT:    buffer_wbinvl1_vol
6665; GFX7-NEXT:    s_endpgm
6666;
6667; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6668; GFX10-WGP:       ; %bb.0: ; %entry
6669; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
6670; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6671; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
6672; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
6673; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
6674; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6675; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
6676; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
6677; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
6678; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
6679; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
6680; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
6681; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6682; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
6683; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
6684; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6685; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6686; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6687; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6688; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6689; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6690; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6691; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6692; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6693; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6694; GFX10-WGP-NEXT:    buffer_gl1_inv
6695; GFX10-WGP-NEXT:    buffer_gl0_inv
6696; GFX10-WGP-NEXT:    s_endpgm
6697;
6698; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6699; GFX10-CU:       ; %bb.0: ; %entry
6700; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
6701; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
6702; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
6703; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
6704; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
6705; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6706; GFX10-CU-NEXT:    s_mov_b32 s4, s8
6707; GFX10-CU-NEXT:    s_mov_b32 s5, s9
6708; GFX10-CU-NEXT:    s_mov_b32 s9, s10
6709; GFX10-CU-NEXT:    s_mov_b32 s8, s11
6710; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
6711; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
6712; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
6713; GFX10-CU-NEXT:    s_mov_b32 s5, s8
6714; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
6715; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6716; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6717; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6718; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6719; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6720; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6721; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6722; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6723; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6724; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6725; GFX10-CU-NEXT:    buffer_gl1_inv
6726; GFX10-CU-NEXT:    buffer_gl0_inv
6727; GFX10-CU-NEXT:    s_endpgm
6728;
6729; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6730; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6731; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
6732; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
6733; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
6734; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
6735; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
6736; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6737; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
6738; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
6739; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
6740; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
6741; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
6742; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
6743; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
6744; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
6745; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
6746; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6747; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6748; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6749; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6750; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6751; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6752; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
6753; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6754; SKIP-CACHE-INV-NEXT:    s_endpgm
6755;
6756; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6757; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6758; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6759; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6760; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6761; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6762; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6763; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6764; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6765; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6766; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6767; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
6768; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6769; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6770; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6771; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
6772; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
6773; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6774;
6775; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6776; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6777; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6778; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6779; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6780; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6781; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6782; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6783; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6784; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6785; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6786; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
6787; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6788; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
6789; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6790; GFX90A-TGSPLIT-NEXT:    buffer_invl2
6791; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
6792; GFX90A-TGSPLIT-NEXT:    s_endpgm
6793;
6794; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6795; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6796; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6797; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6798; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6799; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6800; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6801; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6802; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6803; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6804; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6805; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
6806; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6807; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
6808; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6809; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
6810; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6811;
6812; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6813; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6814; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6815; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6816; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6817; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6819; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6820; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6821; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6822; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6823; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
6824; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6825; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
6826; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6827; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
6828; GFX940-TGSPLIT-NEXT:    s_endpgm
6829;
6830; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6831; GFX11-WGP:       ; %bb.0: ; %entry
6832; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6833; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6834; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6835; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6836; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6837; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6838; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6839; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6840; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6841; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6842; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6843; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6844; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6845; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6846; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
6847; GFX11-WGP-NEXT:    buffer_gl1_inv
6848; GFX11-WGP-NEXT:    buffer_gl0_inv
6849; GFX11-WGP-NEXT:    s_endpgm
6850;
6851; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6852; GFX11-CU:       ; %bb.0: ; %entry
6853; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6854; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6855; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6856; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6857; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6858; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6859; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6860; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6861; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6862; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6863; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6864; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6865; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
6866; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6867; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
6868; GFX11-CU-NEXT:    buffer_gl1_inv
6869; GFX11-CU-NEXT:    buffer_gl0_inv
6870; GFX11-CU-NEXT:    s_endpgm
6871;
6872; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6873; GFX12-WGP:       ; %bb.0: ; %entry
6874; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6875; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6876; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6877; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6878; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6879; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6880; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6881; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6882; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6883; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6884; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
6885; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
6886; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
6887; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
6888; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6889; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6890; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
6891; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
6892; GFX12-WGP-NEXT:    s_endpgm
6893;
6894; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
6895; GFX12-CU:       ; %bb.0: ; %entry
6896; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6897; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6898; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6899; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6900; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6901; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6902; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6903; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6904; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6905; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6906; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
6907; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
6908; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
6909; GFX12-CU-NEXT:    s_wait_storecnt 0x0
6910; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6911; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
6912; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
6913; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
6914; GFX12-CU-NEXT:    s_endpgm
6915    ptr %out, i32 %in, i32 %old) {
6916entry:
  ; Index 4 of i32 elements is a byte offset of 16 from %out - the same 16 that
  ; shows up as an instruction offset or as a separately added constant in the
  ; expected output above.
6917  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare value and %in the replacement; %val is intentionally
  ; left unused so the non-returning form of the atomic is exercised.
6918  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
6919  ret void
6920}
6921
; Returning cmpxchg through a plain `ptr` argument with monotonic success and
; monotonic failure ordering and no explicit syncscope. The loaded value
; (field 0 of the cmpxchg result) is stored back to %out, so every expected
; sequence below uses the cmpswap form with a destination register followed by
; a flat store. None of the expected sequences contain a cache writeback or
; invalidate - only a wait before the store that consumes the atomic's result.
; The assertion lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file) - regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
6922define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
6923; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
6924; GFX7:       ; %bb.0: ; %entry
6925; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6926; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6927; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6928; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6929; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6930; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6931; GFX7-NEXT:    s_mov_b32 s6, s4
6932; GFX7-NEXT:    s_mov_b32 s7, s5
6933; GFX7-NEXT:    s_mov_b32 s11, s12
6934; GFX7-NEXT:    s_mov_b32 s10, s13
6935; GFX7-NEXT:    s_add_u32 s6, s6, s11
6936; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6937; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6938; GFX7-NEXT:    s_mov_b32 s7, s10
6939; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6940; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6941; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6942; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6943; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6944; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6945; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6946; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6947; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6948; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6949; GFX7-NEXT:    flat_store_dword v[0:1], v2
6950; GFX7-NEXT:    s_endpgm
6951;
6952; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
6953; GFX10-WGP:       ; %bb.0: ; %entry
6954; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6955; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6956; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6957; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6958; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6959; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6960; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6961; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6962; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6963; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6964; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6965; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6966; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6967; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6968; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6969; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6970; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6971; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6972; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6973; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6974; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6975; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6976; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6977; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6978; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6979; GFX10-WGP-NEXT:    s_endpgm
6980;
6981; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
6982; GFX10-CU:       ; %bb.0: ; %entry
6983; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6984; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6985; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6986; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6987; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6988; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6989; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6990; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6991; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6992; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6993; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6994; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6995; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6996; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6997; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6998; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6999; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7000; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7001; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7002; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7003; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7004; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7005; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7006; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7007; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7008; GFX10-CU-NEXT:    s_endpgm
7009;
7010; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7011; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7012; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7013; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7014; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7015; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7016; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7017; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7018; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7019; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7020; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7021; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7022; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7023; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7024; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7025; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7026; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7027; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7028; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7029; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7030; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7031; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7032; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7033; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7034; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7035; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7036; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7037; SKIP-CACHE-INV-NEXT:    s_endpgm
7038;
7039; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7040; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7041; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7042; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7043; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7044; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7045; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7046; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7047; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7048; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7049; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7050; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7051; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7052; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7053; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7054; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7055;
7056; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7057; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7058; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7059; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7060; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7061; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7062; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7063; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7064; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7065; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7066; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7067; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7068; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7069; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7070; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7071; GFX90A-TGSPLIT-NEXT:    s_endpgm
7072;
7073; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7074; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7075; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7076; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7077; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7078; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7079; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7080; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7081; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7082; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7083; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7084; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7085; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7086; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7087; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7088; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7089;
7090; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7091; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7092; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7093; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7094; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7095; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7096; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7097; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7098; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7099; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7100; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7101; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7102; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7103; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7104; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7105; GFX940-TGSPLIT-NEXT:    s_endpgm
7106;
7107; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7108; GFX11-WGP:       ; %bb.0: ; %entry
7109; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7110; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7111; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7112; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7113; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7114; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7115; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7116; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7117; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7118; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7119; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7120; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7121; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7122; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7123; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7124; GFX11-WGP-NEXT:    s_endpgm
7125;
7126; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7127; GFX11-CU:       ; %bb.0: ; %entry
7128; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7129; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7130; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7131; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7132; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7133; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7134; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7135; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7136; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7137; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7138; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7139; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7140; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7141; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7142; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7143; GFX11-CU-NEXT:    s_endpgm
7144;
7145; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7146; GFX12-WGP:       ; %bb.0: ; %entry
7147; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7148; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7149; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7150; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7151; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7152; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7153; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7154; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7155; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7156; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7157; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
7158; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7159; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7160; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7161; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7162; GFX12-WGP-NEXT:    s_endpgm
7163;
7164; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
7165; GFX12-CU:       ; %bb.0: ; %entry
7166; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7167; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7168; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7169; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7170; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7171; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7172; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7173; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7174; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7175; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7176; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
7177; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7178; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7179; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7180; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7181; GFX12-CU-NEXT:    s_endpgm
7182    ptr %out, i32 %in, i32 %old) {
7183entry:
  ; Index 4 of i32 elements is a byte offset of 16 from %out - the same 16 that
  ; shows up as an instruction offset or as a separately added constant in the
  ; expected output above.
7184  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare value and %in the replacement.
7185  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; using it
  ; forces the returning form of the atomic.
7186  %val0 = extractvalue { i32, i1 } %val, 0
  ; The store goes to %out itself (offset 0), not to the %gep address.
7187  store i32 %val0, ptr %out, align 4
7188  ret void
7189}
7190
; Volatile cmpxchg with `acquire` success ordering and `monotonic` failure
; ordering on a plain `ptr`, which every run below lowers to a
; flat_atomic_cmpswap. No syncscope is written on the instruction (the test
; name labels this the system-scope case). The value read by the atomic is
; stored back through %out.
; The assertions expect, right after the atomic, a wait followed by a cache
; invalidate whose mnemonic depends on -mcpu (buffer_wbinvl1_vol,
; buffer_gl1_inv + buffer_gl0_inv, buffer_invl2 + buffer_wbinvl1_vol,
; buffer_inv sc0 sc1, or global_inv scope:SCOPE_SYS). The run that passes
; -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
7191define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
7192; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7193; GFX7:       ; %bb.0: ; %entry
7194; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7195; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7196; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7197; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7198; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7199; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7200; GFX7-NEXT:    s_mov_b32 s6, s4
7201; GFX7-NEXT:    s_mov_b32 s7, s5
7202; GFX7-NEXT:    s_mov_b32 s11, s12
7203; GFX7-NEXT:    s_mov_b32 s10, s13
7204; GFX7-NEXT:    s_add_u32 s6, s6, s11
7205; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7206; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7207; GFX7-NEXT:    s_mov_b32 s7, s10
7208; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7209; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7210; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7211; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7212; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7213; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7214; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7215; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7216; GFX7-NEXT:    buffer_wbinvl1_vol
7217; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7218; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7219; GFX7-NEXT:    flat_store_dword v[0:1], v2
7220; GFX7-NEXT:    s_endpgm
7221;
7222; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7223; GFX10-WGP:       ; %bb.0: ; %entry
7224; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7225; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7226; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7227; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7228; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7229; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7230; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7231; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7232; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7233; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7234; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7235; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7236; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7237; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7238; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7239; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7240; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7241; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7242; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7243; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7244; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7245; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7246; GFX10-WGP-NEXT:    buffer_gl1_inv
7247; GFX10-WGP-NEXT:    buffer_gl0_inv
7248; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7249; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7250; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7251; GFX10-WGP-NEXT:    s_endpgm
7252;
7253; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7254; GFX10-CU:       ; %bb.0: ; %entry
7255; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7256; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7257; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7258; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7259; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7260; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7261; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7262; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7263; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7264; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7265; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7266; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7267; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7268; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7269; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7270; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7271; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7272; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7273; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7274; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7275; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7276; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7277; GFX10-CU-NEXT:    buffer_gl1_inv
7278; GFX10-CU-NEXT:    buffer_gl0_inv
7279; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7280; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7281; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7282; GFX10-CU-NEXT:    s_endpgm
7283;
7284; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7285; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7286; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7287; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7288; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7289; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7290; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7291; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7292; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7293; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7294; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7295; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7296; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7297; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7298; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7299; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7300; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7301; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7302; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7303; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7304; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7305; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7306; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7307; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7308; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7309; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7310; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7311; SKIP-CACHE-INV-NEXT:    s_endpgm
7312;
7313; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7314; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7315; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7316; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7317; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7318; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7319; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7320; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7321; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7322; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7323; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7324; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7325; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7326; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7327; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7328; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7329; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7330; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7331;
7332; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7333; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7334; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7335; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7336; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7337; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7338; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7339; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7340; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7341; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7342; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7343; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7344; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7345; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7346; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7347; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7348; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7349; GFX90A-TGSPLIT-NEXT:    s_endpgm
7350;
7351; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7352; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7353; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7354; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7355; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7356; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7357; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7358; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7359; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7360; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7361; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7362; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7363; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7364; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
7365; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7366; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7367; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7368;
7369; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7370; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7371; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7372; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7373; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7374; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7375; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7376; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7377; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7378; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7379; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7380; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7381; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7382; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
7383; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7384; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7385; GFX940-TGSPLIT-NEXT:    s_endpgm
7386;
7387; GFX11-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7388; GFX11-WGP:       ; %bb.0: ; %entry
7389; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7390; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7391; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7392; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7393; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7394; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7395; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7396; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7397; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7398; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7399; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7400; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7401; GFX11-WGP-NEXT:    buffer_gl1_inv
7402; GFX11-WGP-NEXT:    buffer_gl0_inv
7403; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7404; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7405; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7406; GFX11-WGP-NEXT:    s_endpgm
7407;
7408; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7409; GFX11-CU:       ; %bb.0: ; %entry
7410; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7411; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7412; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7413; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7414; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7415; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7416; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7417; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7418; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7419; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7420; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7421; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7422; GFX11-CU-NEXT:    buffer_gl1_inv
7423; GFX11-CU-NEXT:    buffer_gl0_inv
7424; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7425; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7426; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7427; GFX11-CU-NEXT:    s_endpgm
7428;
7429; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7430; GFX12-WGP:       ; %bb.0: ; %entry
7431; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7432; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7433; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7434; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7435; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7436; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7437; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7438; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7439; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7440; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7441; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
7442; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7443; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
7444; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7445; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7446; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7447; GFX12-WGP-NEXT:    s_endpgm
7448;
7449; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
7450; GFX12-CU:       ; %bb.0: ; %entry
7451; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7452; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7453; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7454; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7455; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7456; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7457; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7458; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7459; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7460; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7461; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
7462; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7463; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
7464; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7465; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7466; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7467; GFX12-CU-NEXT:    s_endpgm
7468    ptr %out, i32 %in, i32 %old) {
7469entry:
  ; %gep is %out advanced by four i32 elements, i.e. 16 bytes; the assertions
  ; show this either as offset:16 on the atomic or as an explicit 64-bit add of 16.
7470  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, swap in %in; success ordering acquire, failure monotonic.
7471  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic
  ; Only field 0 of the { i32, i1 } result is used; the i1 field is ignored.
7472  %val0 = extractvalue { i32, i1 } %val, 0
7473  store i32 %val0, ptr %out, align 4
7474  ret void
7475}
7476
; Volatile cmpxchg with `release` success ordering and `monotonic` failure
; ordering on a plain `ptr`, which every run below lowers to a
; flat_atomic_cmpswap. No syncscope is written on the instruction (the test
; name labels this the system-scope case). The value read by the atomic is
; stored back through %out.
; The assertions expect the ordering work BEFORE the atomic: a full wait
; (s_waitcnt vmcnt(0) lgkmcnt(0), plus s_waitcnt_vscnt on gfx1010/gfx1100, or
; the s_wait_* group on gfx1200), preceded by buffer_wbl2 on gfx90a/gfx940 and
; by global_wb scope:SCOPE_SYS on gfx1200. No cache invalidate is expected
; after the atomic; the only wait after it sits directly before the store of
; the returned value.
7477define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
7478; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7479; GFX7:       ; %bb.0: ; %entry
7480; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7481; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7482; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7483; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7484; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7485; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7486; GFX7-NEXT:    s_mov_b32 s6, s4
7487; GFX7-NEXT:    s_mov_b32 s7, s5
7488; GFX7-NEXT:    s_mov_b32 s11, s12
7489; GFX7-NEXT:    s_mov_b32 s10, s13
7490; GFX7-NEXT:    s_add_u32 s6, s6, s11
7491; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7492; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7493; GFX7-NEXT:    s_mov_b32 s7, s10
7494; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7495; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7496; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7497; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7498; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7499; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7500; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7501; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7502; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7503; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7504; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7505; GFX7-NEXT:    flat_store_dword v[0:1], v2
7506; GFX7-NEXT:    s_endpgm
7507;
7508; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7509; GFX10-WGP:       ; %bb.0: ; %entry
7510; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7511; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7512; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7513; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7514; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7515; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7516; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7517; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7518; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7519; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7520; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7521; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7522; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7523; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7524; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7525; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7526; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7527; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7528; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7529; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7530; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7531; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7532; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7533; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7534; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7535; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7536; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7537; GFX10-WGP-NEXT:    s_endpgm
7538;
7539; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7540; GFX10-CU:       ; %bb.0: ; %entry
7541; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7542; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7543; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7544; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7545; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7546; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7547; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7548; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7549; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7550; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7551; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7552; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7553; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7554; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7555; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7556; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7557; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7558; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7559; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7560; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7561; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7562; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7563; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7564; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7565; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7566; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7567; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7568; GFX10-CU-NEXT:    s_endpgm
7569;
7570; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7571; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7572; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7573; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7574; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7575; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7576; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7577; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7578; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7579; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7580; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7581; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7582; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7583; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7584; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7585; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7586; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7587; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7588; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7590; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7591; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7592; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7593; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7594; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7595; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7596; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7597; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7598; SKIP-CACHE-INV-NEXT:    s_endpgm
7599;
7600; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7601; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7602; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7603; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7604; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7605; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7606; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7607; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7608; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7609; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7610; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7611; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7612; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7613; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7614; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7615; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7616; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7617; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7618;
7619; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7620; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7621; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7622; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7623; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7624; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7625; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7626; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7627; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7628; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7629; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7630; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7631; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7632; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7633; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7634; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7635; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7636; GFX90A-TGSPLIT-NEXT:    s_endpgm
7637;
7638; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7639; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7640; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7641; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7642; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7643; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7644; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7645; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7646; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7647; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7648; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7649; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
7650; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7651; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7652; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7653; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7654; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7655; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7656;
7657; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7658; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7659; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7660; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7661; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7662; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7663; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7664; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7665; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7666; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7667; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7668; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
7669; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7670; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7671; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7672; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7673; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7674; GFX940-TGSPLIT-NEXT:    s_endpgm
7675;
7676; GFX11-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7677; GFX11-WGP:       ; %bb.0: ; %entry
7678; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7679; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7680; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7681; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7682; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7683; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7684; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7685; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7686; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7687; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7688; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7689; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7690; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7691; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7692; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7693; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7694; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7695; GFX11-WGP-NEXT:    s_endpgm
7696;
7697; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7698; GFX11-CU:       ; %bb.0: ; %entry
7699; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7700; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7701; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7702; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7703; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7704; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7705; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7706; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7707; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7708; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7709; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7710; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7711; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7712; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7713; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7714; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7715; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7716; GFX11-CU-NEXT:    s_endpgm
7717;
7718; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7719; GFX12-WGP:       ; %bb.0: ; %entry
7720; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7721; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7722; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7723; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7724; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7725; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7726; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7727; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7728; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7729; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7730; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
7731; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
7732; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
7733; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
7734; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7735; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
7736; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7737; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7738; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7739; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7740; GFX12-WGP-NEXT:    s_endpgm
7741;
7742; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
7743; GFX12-CU:       ; %bb.0: ; %entry
7744; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7745; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7746; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7747; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7748; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7749; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7750; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7751; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7752; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7753; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7754; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
7755; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
7756; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
7757; GFX12-CU-NEXT:    s_wait_storecnt 0x0
7758; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7759; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
7760; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7761; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7762; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7763; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7764; GFX12-CU-NEXT:    s_endpgm
7765    ptr %out, i32 %in, i32 %old) {
7766entry:
  ; %gep is %out advanced by four i32 elements, i.e. 16 bytes; the assertions
  ; show this either as offset:16 on the atomic or as an explicit 64-bit add of 16.
7767  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, swap in %in; success ordering release, failure monotonic.
7768  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic
  ; Only field 0 of the { i32, i1 } result is used; the i1 field is ignored.
7769  %val0 = extractvalue { i32, i1 } %val, 0
7770  store i32 %val0, ptr %out, align 4
7771  ret void
7772}
7773
7774define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
7775; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7776; GFX7:       ; %bb.0: ; %entry
7777; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7778; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7779; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7780; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7781; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7782; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7783; GFX7-NEXT:    s_mov_b32 s6, s4
7784; GFX7-NEXT:    s_mov_b32 s7, s5
7785; GFX7-NEXT:    s_mov_b32 s11, s12
7786; GFX7-NEXT:    s_mov_b32 s10, s13
7787; GFX7-NEXT:    s_add_u32 s6, s6, s11
7788; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7789; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7790; GFX7-NEXT:    s_mov_b32 s7, s10
7791; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7792; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7793; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7794; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7795; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7796; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7797; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7798; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7799; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7800; GFX7-NEXT:    buffer_wbinvl1_vol
7801; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7802; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7803; GFX7-NEXT:    flat_store_dword v[0:1], v2
7804; GFX7-NEXT:    s_endpgm
7805;
7806; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7807; GFX10-WGP:       ; %bb.0: ; %entry
7808; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7809; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7810; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7811; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7812; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7813; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7814; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7815; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7816; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7817; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7818; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7819; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7820; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7821; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7822; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7823; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7824; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7825; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7826; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7827; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7828; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7829; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7830; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7831; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7832; GFX10-WGP-NEXT:    buffer_gl1_inv
7833; GFX10-WGP-NEXT:    buffer_gl0_inv
7834; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7835; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7836; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7837; GFX10-WGP-NEXT:    s_endpgm
7838;
7839; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7840; GFX10-CU:       ; %bb.0: ; %entry
7841; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7842; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7843; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7844; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7845; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7846; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7847; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7848; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7849; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7850; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7851; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7852; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7853; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7854; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7855; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7856; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7857; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7858; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7859; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7860; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7861; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7862; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
7863; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7864; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7865; GFX10-CU-NEXT:    buffer_gl1_inv
7866; GFX10-CU-NEXT:    buffer_gl0_inv
7867; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7868; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7869; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7870; GFX10-CU-NEXT:    s_endpgm
7871;
7872; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7873; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7874; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7875; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7876; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7877; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7878; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7879; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7880; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7881; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7882; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7883; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7884; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7885; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7886; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7887; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7888; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7889; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7890; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7891; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7892; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7893; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7894; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7895; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7896; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7897; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7898; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7899; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7900; SKIP-CACHE-INV-NEXT:    s_endpgm
7901;
7902; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7903; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7904; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7905; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7906; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7907; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7908; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7909; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7910; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7911; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7912; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7913; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
7914; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7915; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7916; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7917; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
7918; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
7919; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7920; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7921; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7922;
7923; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7924; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7925; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7926; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7927; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7928; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7929; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7930; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7931; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7932; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7933; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7934; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
7935; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7936; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7937; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7938; GFX90A-TGSPLIT-NEXT:    buffer_invl2
7939; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
7940; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7941; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7942; GFX90A-TGSPLIT-NEXT:    s_endpgm
7943;
7944; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7945; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7946; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7947; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7948; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7949; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7950; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7951; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7952; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7953; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7954; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7955; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
7956; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7957; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7958; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7959; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
7960; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7961; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7962; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7963;
7964; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7965; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7966; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7967; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7968; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7969; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7970; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7971; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7972; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7973; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7974; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7975; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
7976; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7977; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
7978; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7979; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
7980; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7981; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7982; GFX940-TGSPLIT-NEXT:    s_endpgm
7983;
7984; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
7985; GFX11-WGP:       ; %bb.0: ; %entry
7986; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7987; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7988; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7989; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7990; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7991; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7992; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7993; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7994; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7995; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7996; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7997; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
7998; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7999; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8000; GFX11-WGP-NEXT:    buffer_gl1_inv
8001; GFX11-WGP-NEXT:    buffer_gl0_inv
8002; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8003; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8004; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8005; GFX11-WGP-NEXT:    s_endpgm
8006;
8007; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
8008; GFX11-CU:       ; %bb.0: ; %entry
8009; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8010; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8011; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8012; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8013; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8014; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8015; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8016; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8017; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8018; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8019; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8020; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
8021; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8022; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8023; GFX11-CU-NEXT:    buffer_gl1_inv
8024; GFX11-CU-NEXT:    buffer_gl0_inv
8025; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8026; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8027; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8028; GFX11-CU-NEXT:    s_endpgm
8029;
8030; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
8031; GFX12-WGP:       ; %bb.0: ; %entry
8032; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8033; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8034; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8035; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8036; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8037; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8038; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8039; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8040; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8041; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8042; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
8043; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8044; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8045; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
8046; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8047; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8048; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8049; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8050; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8051; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
8052; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8053; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8054; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8055; GFX12-WGP-NEXT:    s_endpgm
8056;
8057; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
8058; GFX12-CU:       ; %bb.0: ; %entry
8059; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8060; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8061; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8062; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8063; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8064; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8065; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8066; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8067; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8068; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8069; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
8070; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
8071; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
8072; GFX12-CU-NEXT:    s_wait_storecnt 0x0
8073; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8074; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8075; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
8076; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
8077; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8078; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
8079; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8080; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8081; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8082; GFX12-CU-NEXT:    s_endpgm
8083    ptr %out, i32 %in, i32 %old) {
8084entry:
8085  %gep = getelementptr i32, ptr %out, i32 4
8086  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic
8087  %val0 = extractvalue { i32, i1 } %val, 0
8088  store i32 %val0, ptr %out, align 4
8089  ret void
8090}
8091
; Test kernel: volatile cmpxchg with seq_cst success ordering and monotonic
; failure ordering, and no explicit syncscope, on the i32 located four
; elements (16 bytes) past %out. The value returned by the cmpxchg is then
; stored to %out.
;
; Arguments (declared after the check lines below):
;   %out - pointer used both as the base of the atomic access and as the
;          destination of the final store
;   %in  - new value written by the cmpxchg when the comparison succeeds
;   %old - expected value the memory contents are compared against
;
; The "; <PREFIX>-..." lines inside the definition are FileCheck assertions,
; one group per --check-prefixes value used on the RUN lines at the top of
; the file. They are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
8092define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
8093; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8094; GFX7:       ; %bb.0: ; %entry
8095; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8096; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8097; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8098; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8099; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8100; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8101; GFX7-NEXT:    s_mov_b32 s6, s4
8102; GFX7-NEXT:    s_mov_b32 s7, s5
8103; GFX7-NEXT:    s_mov_b32 s11, s12
8104; GFX7-NEXT:    s_mov_b32 s10, s13
8105; GFX7-NEXT:    s_add_u32 s6, s6, s11
8106; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8107; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8108; GFX7-NEXT:    s_mov_b32 s7, s10
8109; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8110; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8111; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8112; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8113; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8114; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8115; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8116; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8117; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8118; GFX7-NEXT:    buffer_wbinvl1_vol
8119; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8120; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8121; GFX7-NEXT:    flat_store_dword v[0:1], v2
8122; GFX7-NEXT:    s_endpgm
8123;
8124; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8125; GFX10-WGP:       ; %bb.0: ; %entry
8126; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8127; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8128; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8129; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8130; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8131; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8132; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8133; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8134; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8135; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8136; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8137; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8138; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8139; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8140; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8141; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8142; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8143; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8144; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8145; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8146; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8147; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8148; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8149; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8150; GFX10-WGP-NEXT:    buffer_gl1_inv
8151; GFX10-WGP-NEXT:    buffer_gl0_inv
8152; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8153; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8154; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8155; GFX10-WGP-NEXT:    s_endpgm
8156;
8157; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8158; GFX10-CU:       ; %bb.0: ; %entry
8159; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8160; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8161; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8162; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8163; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8164; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8165; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8166; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8167; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8168; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8169; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8170; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8171; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8172; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8173; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8174; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8175; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8176; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8177; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8178; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8179; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8180; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
8181; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8182; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8183; GFX10-CU-NEXT:    buffer_gl1_inv
8184; GFX10-CU-NEXT:    buffer_gl0_inv
8185; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8186; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8187; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8188; GFX10-CU-NEXT:    s_endpgm
8189;
8190; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8191; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8192; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8193; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8194; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8195; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8196; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8197; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8198; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8199; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8200; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8201; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8202; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8203; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8204; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8205; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8206; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8207; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8208; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8209; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8210; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8211; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8212; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8213; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8214; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8216; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8217; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8218; SKIP-CACHE-INV-NEXT:    s_endpgm
8219;
8220; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8221; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8222; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8223; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8224; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8225; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8226; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8227; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8228; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8229; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8230; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8231; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
8232; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8233; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8234; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8235; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8236; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8237; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8238; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8239; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8240;
8241; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8242; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8243; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8244; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8245; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8246; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8247; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8248; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8249; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8250; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8251; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8252; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
8253; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8254; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8255; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8256; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8257; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8259; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8260; GFX90A-TGSPLIT-NEXT:    s_endpgm
8261;
8262; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8263; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8264; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8265; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8266; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8267; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8268; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8269; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8270; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8271; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8272; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8273; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
8274; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8275; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
8276; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8277; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
8278; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8279; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8280; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8281;
8282; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8283; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8284; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8285; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8286; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8287; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8288; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8289; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8290; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8291; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8292; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8293; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
8294; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8295; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
8296; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8297; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
8298; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8299; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8300; GFX940-TGSPLIT-NEXT:    s_endpgm
8301;
8302; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8303; GFX11-WGP:       ; %bb.0: ; %entry
8304; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8305; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8306; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8307; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8308; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8309; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8310; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8311; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8312; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8313; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8314; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8315; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
8316; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8317; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8318; GFX11-WGP-NEXT:    buffer_gl1_inv
8319; GFX11-WGP-NEXT:    buffer_gl0_inv
8320; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8321; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8322; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8323; GFX11-WGP-NEXT:    s_endpgm
8324;
8325; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8326; GFX11-CU:       ; %bb.0: ; %entry
8327; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8328; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8329; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8330; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8331; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8332; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8333; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8334; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8335; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8336; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8337; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8338; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
8339; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8340; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8341; GFX11-CU-NEXT:    buffer_gl1_inv
8342; GFX11-CU-NEXT:    buffer_gl0_inv
8343; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8344; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8345; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8346; GFX11-CU-NEXT:    s_endpgm
8347;
8348; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8349; GFX12-WGP:       ; %bb.0: ; %entry
8350; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8351; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8352; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8353; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8354; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8355; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8356; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8357; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8358; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8359; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8360; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
8361; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8362; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8363; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
8364; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8365; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8366; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8367; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8368; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8369; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
8370; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8371; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8372; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8373; GFX12-WGP-NEXT:    s_endpgm
8374;
8375; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
8376; GFX12-CU:       ; %bb.0: ; %entry
8377; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8378; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8379; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8380; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8381; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8382; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8383; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8384; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8385; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8386; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8387; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
8388; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
8389; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
8390; GFX12-CU-NEXT:    s_wait_storecnt 0x0
8391; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8392; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8393; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
8394; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
8395; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8396; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
8397; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8398; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8399; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8400; GFX12-CU-NEXT:    s_endpgm
8401    ptr %out, i32 %in, i32 %old) {
8402entry:
  ; Address of the i32 at element index 4 from %out, i.e. 16 bytes past %out
  ; (the checks above show this as "offset:16" or an explicit 64-bit add of 16).
8403  %gep = getelementptr i32, ptr %out, i32 4
  ; Atomically compare the i32 at %gep with %old and, if equal, replace it with
  ; %in. Success ordering is seq_cst, failure ordering is monotonic; the access
  ; is volatile and carries no syncscope.
8404  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; field 1
  ; (the success flag) is not used by this test.
8405  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the value read by the cmpxchg to %out, which keeps the result live.
8406  store i32 %val0, ptr %out, align 4
8407  ret void
8408}
8409
; Returning cmpxchg through a flat pointer with no syncscope on the
; instruction, success ordering monotonic and failure ordering acquire.
; The atomic targets element 4 of %out (byte offset 16); the loaded value
; is then stored to %out itself. Every run configuration below expects the
; returning cmpswap followed by a wait and, except in the run that passes
; -amdgcn-skip-cache-invalidations, a cache invalidate before that store.
; NOTE(review): the gfx1200 expectations here also wait on bvhcnt and
; samplecnt, which the acquire/acquire variant of this test does not;
; presumably that follows from the monotonic success ordering - confirm
; against the memory legalizer before regenerating.
8410define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
8411; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8412; GFX7:       ; %bb.0: ; %entry
8413; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8414; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8415; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8416; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8417; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8418; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8419; GFX7-NEXT:    s_mov_b32 s6, s4
8420; GFX7-NEXT:    s_mov_b32 s7, s5
8421; GFX7-NEXT:    s_mov_b32 s11, s12
8422; GFX7-NEXT:    s_mov_b32 s10, s13
8423; GFX7-NEXT:    s_add_u32 s6, s6, s11
8424; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8425; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8426; GFX7-NEXT:    s_mov_b32 s7, s10
8427; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8428; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8429; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8430; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8431; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8432; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8433; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8434; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8435; GFX7-NEXT:    buffer_wbinvl1_vol
8436; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8437; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8438; GFX7-NEXT:    flat_store_dword v[0:1], v2
8439; GFX7-NEXT:    s_endpgm
8440;
8441; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8442; GFX10-WGP:       ; %bb.0: ; %entry
8443; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8444; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8445; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8446; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8447; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8448; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8449; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8450; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8451; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8452; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8453; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8454; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8455; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8456; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8457; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8458; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8459; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8460; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8461; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8462; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8463; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8464; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8465; GFX10-WGP-NEXT:    buffer_gl1_inv
8466; GFX10-WGP-NEXT:    buffer_gl0_inv
8467; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8468; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8469; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8470; GFX10-WGP-NEXT:    s_endpgm
8471;
8472; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8473; GFX10-CU:       ; %bb.0: ; %entry
8474; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8475; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8476; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8477; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8478; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8479; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8480; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8481; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8482; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8483; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8484; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8485; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8486; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8487; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8488; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8489; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8490; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8491; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8492; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8493; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8494; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8495; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8496; GFX10-CU-NEXT:    buffer_gl1_inv
8497; GFX10-CU-NEXT:    buffer_gl0_inv
8498; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8499; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8500; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8501; GFX10-CU-NEXT:    s_endpgm
8502;
8503; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8504; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8505; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8506; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8507; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8508; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8509; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8510; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8511; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8512; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8513; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8514; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8515; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8516; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8517; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8518; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8519; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8520; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8521; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8522; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8523; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8524; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8525; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8526; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8527; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8529; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8530; SKIP-CACHE-INV-NEXT:    s_endpgm
8531;
8532; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8533; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8534; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8535; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8536; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8537; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8538; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8539; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8540; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8541; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8542; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8543; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8544; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8545; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8546; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8547; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8548; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8549; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8550;
8551; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8552; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8553; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8554; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8555; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8556; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8557; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8558; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8559; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8560; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8561; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8562; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8563; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8564; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8565; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8566; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8567; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8568; GFX90A-TGSPLIT-NEXT:    s_endpgm
8569;
8570; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8571; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8572; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8573; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8574; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8575; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8576; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8577; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8578; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8579; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8580; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8581; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
8582; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8583; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
8584; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8585; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8586; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8587;
8588; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8589; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8590; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8591; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8592; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8593; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8594; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8595; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8596; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8597; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8598; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8599; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
8600; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8601; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
8602; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8603; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8604; GFX940-TGSPLIT-NEXT:    s_endpgm
8605;
8606; GFX11-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8607; GFX11-WGP:       ; %bb.0: ; %entry
8608; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8609; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8610; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8611; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8612; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8613; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8614; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8615; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8616; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8617; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8618; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8619; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8620; GFX11-WGP-NEXT:    buffer_gl1_inv
8621; GFX11-WGP-NEXT:    buffer_gl0_inv
8622; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8623; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8624; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8625; GFX11-WGP-NEXT:    s_endpgm
8626;
8627; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8628; GFX11-CU:       ; %bb.0: ; %entry
8629; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8630; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8631; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8632; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8633; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8634; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8635; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8636; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8637; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8638; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8639; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8640; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8641; GFX11-CU-NEXT:    buffer_gl1_inv
8642; GFX11-CU-NEXT:    buffer_gl0_inv
8643; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8644; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8645; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8646; GFX11-CU-NEXT:    s_endpgm
8647;
8648; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8649; GFX12-WGP:       ; %bb.0: ; %entry
8650; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8651; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8652; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8653; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8654; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8655; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8656; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8657; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8658; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8659; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8660; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8661; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
8662; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
8663; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8664; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
8665; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8666; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8667; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8668; GFX12-WGP-NEXT:    s_endpgm
8669;
8670; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
8671; GFX12-CU:       ; %bb.0: ; %entry
8672; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8673; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8674; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8675; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8676; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8677; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8678; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8679; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8680; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8681; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8682; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8683; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
8684; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
8685; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8686; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
8687; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8688; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8689; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8690; GFX12-CU-NEXT:    s_endpgm
8691    ptr %out, i32 %in, i32 %old) {
8692entry:
  ; i32 index 4 from %out, i.e. 16 bytes past the base pointer.
8693  %gep = getelementptr i32, ptr %out, i32 4
8694  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire
  ; Field 0 of the cmpxchg result is the value that was in memory.
8695  %val0 = extractvalue { i32, i1 } %val, 0
8696  store i32 %val0, ptr %out, align 4
8697  ret void
8698}
8699
; Returning cmpxchg through a flat pointer with no syncscope on the
; instruction, with acquire for both the success and failure orderings.
; The atomic targets element 4 of %out (byte offset 16); the loaded value
; is then stored to %out itself. Every run configuration below expects the
; returning cmpswap followed by a wait and, except in the run that passes
; -amdgcn-skip-cache-invalidations, a cache invalidate before that store.
8700define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
8701; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8702; GFX7:       ; %bb.0: ; %entry
8703; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8704; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8705; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8706; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8707; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8708; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8709; GFX7-NEXT:    s_mov_b32 s6, s4
8710; GFX7-NEXT:    s_mov_b32 s7, s5
8711; GFX7-NEXT:    s_mov_b32 s11, s12
8712; GFX7-NEXT:    s_mov_b32 s10, s13
8713; GFX7-NEXT:    s_add_u32 s6, s6, s11
8714; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8715; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8716; GFX7-NEXT:    s_mov_b32 s7, s10
8717; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8718; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8719; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8720; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8721; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8722; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8723; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8724; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8725; GFX7-NEXT:    buffer_wbinvl1_vol
8726; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8727; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8728; GFX7-NEXT:    flat_store_dword v[0:1], v2
8729; GFX7-NEXT:    s_endpgm
8730;
8731; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8732; GFX10-WGP:       ; %bb.0: ; %entry
8733; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8734; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8735; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8736; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8737; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8738; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8739; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8740; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8741; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8742; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8743; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8744; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8745; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8746; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8747; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8748; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8749; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8750; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8751; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8752; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8753; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8754; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8755; GFX10-WGP-NEXT:    buffer_gl1_inv
8756; GFX10-WGP-NEXT:    buffer_gl0_inv
8757; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8758; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8759; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8760; GFX10-WGP-NEXT:    s_endpgm
8761;
8762; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8763; GFX10-CU:       ; %bb.0: ; %entry
8764; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8765; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8766; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8767; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8768; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8769; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8770; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8771; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8772; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8773; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8774; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8775; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8776; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8777; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8778; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8779; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8780; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8781; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8782; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8783; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8784; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8785; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8786; GFX10-CU-NEXT:    buffer_gl1_inv
8787; GFX10-CU-NEXT:    buffer_gl0_inv
8788; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8789; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8790; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8791; GFX10-CU-NEXT:    s_endpgm
8792;
8793; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8794; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8795; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8796; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8797; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8798; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8799; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8800; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8801; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8802; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8803; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8804; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8805; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8806; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8807; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8808; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8809; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8810; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8811; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8812; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8813; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8814; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8815; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8816; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8817; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8818; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8819; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8820; SKIP-CACHE-INV-NEXT:    s_endpgm
8821;
8822; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8823; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8824; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8825; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8826; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8827; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8828; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8829; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8830; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8831; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8832; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8833; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8834; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8835; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
8836; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
8837; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8838; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8839; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8840;
8841; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8842; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8843; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8844; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8845; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8846; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8847; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8848; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8849; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8850; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8851; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8852; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8853; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8854; GFX90A-TGSPLIT-NEXT:    buffer_invl2
8855; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
8856; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8857; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8858; GFX90A-TGSPLIT-NEXT:    s_endpgm
8859;
8860; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8861; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8862; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8863; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8864; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8865; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8866; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8867; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8868; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8869; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8870; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8871; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
8872; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8873; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
8874; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8875; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8876; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8877;
8878; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8879; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8880; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8881; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8882; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8883; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8884; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8885; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8886; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8887; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8888; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8889; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
8890; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8891; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
8892; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8893; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8894; GFX940-TGSPLIT-NEXT:    s_endpgm
8895;
8896; GFX11-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8897; GFX11-WGP:       ; %bb.0: ; %entry
8898; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8899; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8900; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8901; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8902; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8903; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8904; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8905; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8906; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8907; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8908; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8909; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8910; GFX11-WGP-NEXT:    buffer_gl1_inv
8911; GFX11-WGP-NEXT:    buffer_gl0_inv
8912; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8913; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8914; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8915; GFX11-WGP-NEXT:    s_endpgm
8916;
8917; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8918; GFX11-CU:       ; %bb.0: ; %entry
8919; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8920; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8921; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8922; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8923; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8924; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8925; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8926; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8927; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8928; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8929; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8930; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8931; GFX11-CU-NEXT:    buffer_gl1_inv
8932; GFX11-CU-NEXT:    buffer_gl0_inv
8933; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8934; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8935; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8936; GFX11-CU-NEXT:    s_endpgm
8937;
8938; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8939; GFX12-WGP:       ; %bb.0: ; %entry
8940; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8941; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8942; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8943; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8944; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8945; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8946; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8947; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8948; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8949; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8950; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8951; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8952; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
8953; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8954; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8955; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8956; GFX12-WGP-NEXT:    s_endpgm
8957;
8958; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
8959; GFX12-CU:       ; %bb.0: ; %entry
8960; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8961; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8962; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8963; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8964; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8965; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8966; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8967; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8968; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8969; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8970; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8971; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8972; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
8973; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8974; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8975; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8976; GFX12-CU-NEXT:    s_endpgm
8977    ptr %out, i32 %in, i32 %old) {
8978entry:
  ; i32 index 4 from %out, i.e. 16 bytes past the base pointer.
8979  %gep = getelementptr i32, ptr %out, i32 4
8980  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire
  ; Field 0 of the cmpxchg result is the value that was in memory.
8981  %val0 = extractvalue { i32, i1 } %val, 0
8982  store i32 %val0, ptr %out, align 4
8983  ret void
8984}
8985
8986define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
8987; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
8988; GFX7:       ; %bb.0: ; %entry
8989; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8990; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8991; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8992; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8993; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8994; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8995; GFX7-NEXT:    s_mov_b32 s6, s4
8996; GFX7-NEXT:    s_mov_b32 s7, s5
8997; GFX7-NEXT:    s_mov_b32 s11, s12
8998; GFX7-NEXT:    s_mov_b32 s10, s13
8999; GFX7-NEXT:    s_add_u32 s6, s6, s11
9000; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9001; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9002; GFX7-NEXT:    s_mov_b32 s7, s10
9003; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9004; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9005; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9006; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9007; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9008; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9009; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9010; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9011; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9012; GFX7-NEXT:    buffer_wbinvl1_vol
9013; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9014; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9015; GFX7-NEXT:    flat_store_dword v[0:1], v2
9016; GFX7-NEXT:    s_endpgm
9017;
9018; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
9019; GFX10-WGP:       ; %bb.0: ; %entry
9020; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9021; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9022; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9023; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9024; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9025; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9026; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9027; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9028; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9029; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9030; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9031; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9032; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9033; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9034; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9035; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9036; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9037; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9038; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9039; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9040; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9041; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9042; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9043; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9044; GFX10-WGP-NEXT:    buffer_gl1_inv
9045; GFX10-WGP-NEXT:    buffer_gl0_inv
9046; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9047; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9048; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9049; GFX10-WGP-NEXT:    s_endpgm
9050;
9051; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
9052; GFX10-CU:       ; %bb.0: ; %entry
9053; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9054; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9055; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9056; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9057; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9058; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9059; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9060; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9061; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9062; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9063; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9064; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9065; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9066; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9067; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9068; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9069; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9070; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9071; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9072; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9073; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9074; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9075; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9076; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9077; GFX10-CU-NEXT:    buffer_gl1_inv
9078; GFX10-CU-NEXT:    buffer_gl0_inv
9079; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9080; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9081; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9082; GFX10-CU-NEXT:    s_endpgm
9083;
9084; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg:
9085; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9086; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9087; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9088; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9089; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9090; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9091; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9092; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9093; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9094; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9095; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9096; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9097; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9098; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9099; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9100; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9101; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9102; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9103; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9104; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9105; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9106; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9107; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9108; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9109; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9111; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9112; SKIP-CACHE-INV-NEXT:    s_endpgm
9113;
9114; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
9115; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9116; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9117; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9118; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9119; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9120; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9121; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9122; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9123; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9124; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9125; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
9126; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9127; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9128; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9129; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
9130; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
9131; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9132; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9133; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9134;
9135; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
9136; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9137; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9138; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9139; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9140; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9141; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9142; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9143; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9144; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9145; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9146; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
9147; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9148; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9149; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9150; GFX90A-TGSPLIT-NEXT:    buffer_invl2
9151; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9152; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9153; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9154; GFX90A-TGSPLIT-NEXT:    s_endpgm
9155;
9156; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
9157; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9158; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9159; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9160; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9161; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9162; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9163; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9164; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9165; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9166; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9167; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
9168; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9169; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
9170; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9171; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
9172; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9173; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9174; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9175;
9176; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
9177; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9178; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9179; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9180; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9181; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9182; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9183; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9184; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9185; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9186; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9187; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
9188; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9189; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
9190; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9191; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
9192; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9193; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9194; GFX940-TGSPLIT-NEXT:    s_endpgm
9195;
9196; GFX11-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
9197; GFX11-WGP:       ; %bb.0: ; %entry
9198; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9199; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9200; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9201; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9202; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9203; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9204; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9205; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9206; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9207; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9208; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9209; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9210; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9211; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9212; GFX11-WGP-NEXT:    buffer_gl1_inv
9213; GFX11-WGP-NEXT:    buffer_gl0_inv
9214; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9215; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9216; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9217; GFX11-WGP-NEXT:    s_endpgm
9218;
9219; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
9220; GFX11-CU:       ; %bb.0: ; %entry
9221; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9222; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9223; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9224; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9225; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9226; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9227; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9228; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9229; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9230; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9231; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9232; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9233; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9234; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9235; GFX11-CU-NEXT:    buffer_gl1_inv
9236; GFX11-CU-NEXT:    buffer_gl0_inv
9237; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9238; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9239; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9240; GFX11-CU-NEXT:    s_endpgm
9241;
9242; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
9243; GFX12-WGP:       ; %bb.0: ; %entry
9244; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9245; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9246; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9247; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9248; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9249; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9250; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9251; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9252; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9253; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9254; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
9255; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9256; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9257; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
9258; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9259; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
9260; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9261; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9262; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9263; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
9264; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9265; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9266; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9267; GFX12-WGP-NEXT:    s_endpgm
9268;
9269; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
9270; GFX12-CU:       ; %bb.0: ; %entry
9271; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9272; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9273; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9274; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9275; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9276; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9277; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9278; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9279; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9280; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9281; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
9282; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
9283; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
9284; GFX12-CU-NEXT:    s_wait_storecnt 0x0
9285; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9286; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
9287; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
9288; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
9289; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9290; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
9291; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9292; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9293; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9294; GFX12-CU-NEXT:    s_endpgm
9295    ptr %out, i32 %in, i32 %old) {
9296entry:
9297  %gep = getelementptr i32, ptr %out, i32 4
9298  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire
9299  %val0 = extractvalue { i32, i1 } %val, 0
9300  store i32 %val0, ptr %out, align 4
9301  ret void
9302}
9303
; Test: a volatile i32 compare-and-swap through a flat `ptr`, with success
; ordering acq_rel and failure ordering acquire and no explicit syncscope.
; The value returned by the cmpxchg is stored back through %out.
;
; The assertions between the `define` line and the parameter list pin the -O0
; llc output for each run configuration listed at the top of this file. They
; are autogenerated (see the note on the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output shows for this ordering pair:
;   * before the returning flat_atomic_cmpswap there is always a wait on
;     outstanding memory operations; the gfx90a, gfx940 and gfx1200 runs
;     additionally expect a writeback (buffer_wbl2 / global_wb) ahead of it;
;   * after the atomic there is always another wait, followed by a cache
;     invalidate in every run except the one that passes
;     -amdgcn-skip-cache-invalidations, which expects none;
;   * the gfx1200 runs expect scope:SCOPE_SYS on the atomic, the writeback and
;     the invalidate.
9304define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
9305; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9306; GFX7:       ; %bb.0: ; %entry
9307; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9308; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9309; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9310; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9311; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9312; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9313; GFX7-NEXT:    s_mov_b32 s6, s4
9314; GFX7-NEXT:    s_mov_b32 s7, s5
9315; GFX7-NEXT:    s_mov_b32 s11, s12
9316; GFX7-NEXT:    s_mov_b32 s10, s13
9317; GFX7-NEXT:    s_add_u32 s6, s6, s11
9318; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9319; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9320; GFX7-NEXT:    s_mov_b32 s7, s10
9321; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9322; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9323; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9324; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9325; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9326; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9327; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9328; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9329; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9330; GFX7-NEXT:    buffer_wbinvl1_vol
9331; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9332; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9333; GFX7-NEXT:    flat_store_dword v[0:1], v2
9334; GFX7-NEXT:    s_endpgm
9335;
9336; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9337; GFX10-WGP:       ; %bb.0: ; %entry
9338; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9339; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9340; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9341; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9342; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9343; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9344; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9345; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9346; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9347; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9348; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9349; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9350; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9351; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9352; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9353; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9354; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9355; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9356; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9357; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9358; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9359; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9360; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9361; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9362; GFX10-WGP-NEXT:    buffer_gl1_inv
9363; GFX10-WGP-NEXT:    buffer_gl0_inv
9364; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9365; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9366; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9367; GFX10-WGP-NEXT:    s_endpgm
9368;
9369; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9370; GFX10-CU:       ; %bb.0: ; %entry
9371; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9372; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9373; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9374; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9375; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9376; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9377; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9378; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9379; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9380; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9381; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9382; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9383; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9384; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9385; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9387; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9388; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9389; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9390; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9391; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9392; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9393; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9394; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9395; GFX10-CU-NEXT:    buffer_gl1_inv
9396; GFX10-CU-NEXT:    buffer_gl0_inv
9397; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9398; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9399; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9400; GFX10-CU-NEXT:    s_endpgm
9401;
9402; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9403; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9404; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9405; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9406; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9407; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9408; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9409; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9410; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9411; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9412; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9413; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9414; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9415; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9416; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9417; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9418; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9419; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9420; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9421; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9422; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9423; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9424; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9425; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9426; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9427; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9428; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9429; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9430; SKIP-CACHE-INV-NEXT:    s_endpgm
9431;
9432; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9433; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9434; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9435; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9436; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9437; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9438; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9439; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9440; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9441; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9442; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9443; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
9444; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9445; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9446; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9447; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
9448; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
9449; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9450; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9451; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9452;
9453; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9454; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9455; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9456; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9457; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9458; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9459; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9460; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9461; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9462; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9463; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9464; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
9465; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9466; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9467; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9468; GFX90A-TGSPLIT-NEXT:    buffer_invl2
9469; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9470; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9471; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9472; GFX90A-TGSPLIT-NEXT:    s_endpgm
9473;
9474; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9475; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9476; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9477; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9478; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9479; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9480; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9481; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9482; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9483; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9484; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9485; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
9486; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9487; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
9488; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9489; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
9490; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9491; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9492; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9493;
9494; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9495; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9496; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9497; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9498; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9499; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9500; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9501; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9502; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9503; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9504; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9505; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
9506; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9507; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
9508; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9509; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
9510; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9511; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9512; GFX940-TGSPLIT-NEXT:    s_endpgm
9513;
9514; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9515; GFX11-WGP:       ; %bb.0: ; %entry
9516; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9517; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9518; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9519; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9520; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9521; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9522; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9523; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9524; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9525; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9526; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9527; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9528; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9529; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9530; GFX11-WGP-NEXT:    buffer_gl1_inv
9531; GFX11-WGP-NEXT:    buffer_gl0_inv
9532; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9533; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9534; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9535; GFX11-WGP-NEXT:    s_endpgm
9536;
9537; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9538; GFX11-CU:       ; %bb.0: ; %entry
9539; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9540; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9541; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9542; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9543; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9544; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9545; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9546; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9547; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9548; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9549; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9550; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9551; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9552; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9553; GFX11-CU-NEXT:    buffer_gl1_inv
9554; GFX11-CU-NEXT:    buffer_gl0_inv
9555; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9556; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9557; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9558; GFX11-CU-NEXT:    s_endpgm
9559;
9560; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9561; GFX12-WGP:       ; %bb.0: ; %entry
9562; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9563; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9564; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9565; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9566; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9567; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9568; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9569; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9570; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9571; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9572; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
9573; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9574; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9575; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
9576; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9577; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
9578; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9579; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9580; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9581; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
9582; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9583; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9584; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9585; GFX12-WGP-NEXT:    s_endpgm
9586;
9587; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
9588; GFX12-CU:       ; %bb.0: ; %entry
9589; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9590; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9591; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9592; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9593; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9594; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9595; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9596; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9597; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9598; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9599; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
9600; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
9601; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
9602; GFX12-CU-NEXT:    s_wait_storecnt 0x0
9603; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9604; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
9605; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
9606; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
9607; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9608; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
9609; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9610; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9611; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9612; GFX12-CU-NEXT:    s_endpgm
9613    ptr %out, i32 %in, i32 %old) {
9614entry:
  ; Element index 4 of i32 puts the target 16 bytes past %out. The expected
  ; output carries this either as `offset:16` on the atomic or as a 64-bit
  ; scalar add of the constant 16 to the pointer.
9615  %gep = getelementptr i32, ptr %out, i32 4
  ; Operands are (pointer, expected, replacement); the trailing orderings are
  ; success first (acq_rel), then failure (acquire).
9616  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; field 1
  ; (the success flag) is not used by this test.
9617  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the returned value keeps the result live; every expected output
  ; above uses the returning form of the atomic and then a flat store of v2.
9618  store i32 %val0, ptr %out, align 4
9619  ret void
9620}
9621
9622define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
9623; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9624; GFX7:       ; %bb.0: ; %entry
9625; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9626; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9627; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9628; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9629; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9630; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9631; GFX7-NEXT:    s_mov_b32 s6, s4
9632; GFX7-NEXT:    s_mov_b32 s7, s5
9633; GFX7-NEXT:    s_mov_b32 s11, s12
9634; GFX7-NEXT:    s_mov_b32 s10, s13
9635; GFX7-NEXT:    s_add_u32 s6, s6, s11
9636; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9637; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9638; GFX7-NEXT:    s_mov_b32 s7, s10
9639; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9640; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9641; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9642; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9643; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9644; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9645; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9646; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9647; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9648; GFX7-NEXT:    buffer_wbinvl1_vol
9649; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9650; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9651; GFX7-NEXT:    flat_store_dword v[0:1], v2
9652; GFX7-NEXT:    s_endpgm
9653;
9654; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9655; GFX10-WGP:       ; %bb.0: ; %entry
9656; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9657; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9658; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9659; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9660; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9661; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9662; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9663; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9664; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9665; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9666; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9667; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9668; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9669; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9670; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9671; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9672; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9673; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9674; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9675; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9676; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9677; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9678; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9679; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9680; GFX10-WGP-NEXT:    buffer_gl1_inv
9681; GFX10-WGP-NEXT:    buffer_gl0_inv
9682; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9683; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9684; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9685; GFX10-WGP-NEXT:    s_endpgm
9686;
9687; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9688; GFX10-CU:       ; %bb.0: ; %entry
9689; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9690; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9691; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9692; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9693; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9694; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9695; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9696; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9697; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9698; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9699; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9700; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9701; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9702; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9703; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9704; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9705; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9706; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9707; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9708; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9709; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9710; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9711; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9712; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9713; GFX10-CU-NEXT:    buffer_gl1_inv
9714; GFX10-CU-NEXT:    buffer_gl0_inv
9715; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9716; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9717; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9718; GFX10-CU-NEXT:    s_endpgm
9719;
9720; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9721; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9722; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9723; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9724; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9725; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9726; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9727; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9728; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9729; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9730; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9731; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9732; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9733; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9734; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9735; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9736; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9737; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9738; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9739; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9740; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9741; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9742; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9743; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9744; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9745; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9746; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9747; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9748; SKIP-CACHE-INV-NEXT:    s_endpgm
9749;
9750; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9751; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9752; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9753; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9754; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9755; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9756; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9757; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9758; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9759; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9760; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9761; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
9762; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9763; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9764; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9765; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
9766; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
9767; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9768; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9769; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9770;
9771; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9772; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9773; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9774; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9775; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9776; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9777; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9778; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9779; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9780; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9781; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9782; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
9783; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9784; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9785; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9786; GFX90A-TGSPLIT-NEXT:    buffer_invl2
9787; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
9788; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9789; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9790; GFX90A-TGSPLIT-NEXT:    s_endpgm
9791;
9792; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9793; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9794; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9795; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9796; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9797; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9798; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9799; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9800; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9801; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9802; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9803; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
9804; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9805; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
9806; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9807; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
9808; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9809; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9810; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9811;
9812; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9813; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9814; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9815; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9816; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9817; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9819; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9820; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9821; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9822; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9823; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
9824; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9825; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
9826; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9827; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
9828; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9829; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9830; GFX940-TGSPLIT-NEXT:    s_endpgm
9831;
9832; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9833; GFX11-WGP:       ; %bb.0: ; %entry
9834; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9835; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9836; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9837; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9838; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9839; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9840; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9841; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9842; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9843; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9844; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9845; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9846; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9847; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9848; GFX11-WGP-NEXT:    buffer_gl1_inv
9849; GFX11-WGP-NEXT:    buffer_gl0_inv
9850; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9851; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9852; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9853; GFX11-WGP-NEXT:    s_endpgm
9854;
9855; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9856; GFX11-CU:       ; %bb.0: ; %entry
9857; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9858; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9859; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9860; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9861; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9862; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9863; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9864; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9865; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9866; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9867; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9868; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
9869; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9870; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9871; GFX11-CU-NEXT:    buffer_gl1_inv
9872; GFX11-CU-NEXT:    buffer_gl0_inv
9873; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9874; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9875; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9876; GFX11-CU-NEXT:    s_endpgm
9877;
9878; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9879; GFX12-WGP:       ; %bb.0: ; %entry
9880; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9881; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9882; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9883; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9884; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9885; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9886; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9887; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9888; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9889; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9890; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
9891; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9892; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9893; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
9894; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9895; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
9896; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
9897; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
9898; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9899; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
9900; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9901; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9902; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9903; GFX12-WGP-NEXT:    s_endpgm
9904;
9905; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
9906; GFX12-CU:       ; %bb.0: ; %entry
9907; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9908; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9909; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9910; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9911; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9912; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9913; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9914; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9915; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9916; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9917; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
9918; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
9919; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
9920; GFX12-CU-NEXT:    s_wait_storecnt 0x0
9921; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9922; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
9923; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
9924; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
9925; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9926; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
9927; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9928; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9929; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9930; GFX12-CU-NEXT:    s_endpgm
9931    ptr %out, i32 %in, i32 %old) {
9932entry:
9933  %gep = getelementptr i32, ptr %out, i32 4
9934  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire
9935  %val0 = extractvalue { i32, i1 } %val, 0
9936  store i32 %val0, ptr %out, align 4
9937  ret void
9938}
9939
; Kernel under test: a volatile cmpxchg on a generic (no address space) pointer
; with success ordering monotonic and failure ordering seq_cst, and no
; syncscope specified. The value returned by the cmpxchg is stored back to
; %out, so the returning form of the atomic is what gets checked.
; The assertions between the define line and the parameter list are
; autogenerated (see the note at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
9940define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
9941; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
9942; GFX7:       ; %bb.0: ; %entry
9943; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9944; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9945; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9946; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9947; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9948; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9949; GFX7-NEXT:    s_mov_b32 s6, s4
9950; GFX7-NEXT:    s_mov_b32 s7, s5
9951; GFX7-NEXT:    s_mov_b32 s11, s12
9952; GFX7-NEXT:    s_mov_b32 s10, s13
9953; GFX7-NEXT:    s_add_u32 s6, s6, s11
9954; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9955; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9956; GFX7-NEXT:    s_mov_b32 s7, s10
9957; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9958; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9959; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9960; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9961; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9962; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9963; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9964; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9965; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9966; GFX7-NEXT:    buffer_wbinvl1_vol
9967; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9968; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9969; GFX7-NEXT:    flat_store_dword v[0:1], v2
9970; GFX7-NEXT:    s_endpgm
9971;
9972; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
9973; GFX10-WGP:       ; %bb.0: ; %entry
9974; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9975; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9976; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9977; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9978; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9979; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9980; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9981; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9982; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9983; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9984; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9985; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9986; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9987; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9988; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9989; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9990; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9991; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9992; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9993; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9994; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9995; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
9996; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9997; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9998; GFX10-WGP-NEXT:    buffer_gl1_inv
9999; GFX10-WGP-NEXT:    buffer_gl0_inv
10000; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10001; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10002; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10003; GFX10-WGP-NEXT:    s_endpgm
10004;
10005; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10006; GFX10-CU:       ; %bb.0: ; %entry
10007; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
10008; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10009; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
10010; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
10011; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
10012; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10013; GFX10-CU-NEXT:    s_mov_b32 s6, s4
10014; GFX10-CU-NEXT:    s_mov_b32 s7, s5
10015; GFX10-CU-NEXT:    s_mov_b32 s11, s12
10016; GFX10-CU-NEXT:    s_mov_b32 s10, s13
10017; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
10018; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
10019; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10020; GFX10-CU-NEXT:    s_mov_b32 s7, s10
10021; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
10022; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
10023; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10024; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
10025; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10026; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10027; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10028; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10029; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10030; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10031; GFX10-CU-NEXT:    buffer_gl1_inv
10032; GFX10-CU-NEXT:    buffer_gl0_inv
10033; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10034; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10035; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10036; GFX10-CU-NEXT:    s_endpgm
10037;
10038; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10039; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10040; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
10041; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
10042; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
10043; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
10044; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
10045; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10046; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
10047; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
10048; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
10049; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
10050; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
10051; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
10052; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
10053; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
10054; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
10055; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10056; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10057; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
10058; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10059; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10060; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10061; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10062; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10063; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10064; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10065; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10066; SKIP-CACHE-INV-NEXT:    s_endpgm
10067;
10068; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10069; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10070; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10071; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
10072; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
10073; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10074; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
10075; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
10076; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10077; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10078; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10079; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
10080; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10081; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10082; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10083; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
10084; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
10085; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10086; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10087; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10088;
10089; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10090; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10091; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10092; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
10093; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
10094; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10095; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
10096; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
10097; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10098; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10099; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10100; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
10101; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10102; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10103; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10104; GFX90A-TGSPLIT-NEXT:    buffer_invl2
10105; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
10106; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10107; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10108; GFX90A-TGSPLIT-NEXT:    s_endpgm
10109;
10110; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10111; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10112; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10113; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
10114; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
10115; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10116; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
10117; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
10118; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10119; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10120; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10121; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
10122; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10123; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
10124; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10125; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
10126; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10127; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10128; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10129;
10130; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10131; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10132; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10133; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
10134; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
10135; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10136; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
10137; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
10138; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10139; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10140; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10141; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
10142; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10143; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
10144; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10145; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
10146; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10147; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10148; GFX940-TGSPLIT-NEXT:    s_endpgm
10149;
10150; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10151; GFX11-WGP:       ; %bb.0: ; %entry
10152; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10153; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
10154; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
10155; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10156; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
10157; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10158; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10159; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
10160; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10161; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10162; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10163; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10164; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10165; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10166; GFX11-WGP-NEXT:    buffer_gl1_inv
10167; GFX11-WGP-NEXT:    buffer_gl0_inv
10168; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10169; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10170; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10171; GFX11-WGP-NEXT:    s_endpgm
10172;
10173; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10174; GFX11-CU:       ; %bb.0: ; %entry
10175; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10176; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
10177; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
10178; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10179; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
10180; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10181; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10182; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
10183; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10184; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10185; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10186; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10187; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10188; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10189; GFX11-CU-NEXT:    buffer_gl1_inv
10190; GFX11-CU-NEXT:    buffer_gl0_inv
10191; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10192; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10193; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10194; GFX11-CU-NEXT:    s_endpgm
10195;
10196; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10197; GFX12-WGP:       ; %bb.0: ; %entry
10198; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10199; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
10200; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
10201; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10202; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
10203; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10204; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10205; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
10206; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10207; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10208; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
10209; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10210; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10211; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
10212; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10213; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10214; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10215; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10216; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10217; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
10218; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10219; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10220; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10221; GFX12-WGP-NEXT:    s_endpgm
10222;
10223; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
10224; GFX12-CU:       ; %bb.0: ; %entry
10225; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10226; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
10227; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
10228; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10229; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
10230; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10231; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10232; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
10233; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10234; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10235; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
10236; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
10237; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
10238; GFX12-CU-NEXT:    s_wait_storecnt 0x0
10239; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10240; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10241; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
10242; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
10243; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10244; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
10245; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10246; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10247; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10248; GFX12-CU-NEXT:    s_endpgm
10249    ptr %out, i32 %in, i32 %old) {
10250entry:
; Address the fifth i32 element of %out; this is the 16-byte displacement that
; appears in the expected output as either an explicit 64-bit add of 16 or an
; offset:16 operand on the atomic.
10251  %gep = getelementptr i32, ptr %out, i32 4
; Compare the memory value against %old and, on a match, write %in.
; Orderings: success = monotonic, failure = seq_cst.
10252  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst
; Field 0 of the cmpxchg result pair is the value that was read from memory;
; storing it keeps the returned value live.
10253  %val0 = extractvalue { i32, i1 } %val, 0
10254  store i32 %val0, ptr %out, align 4
10255  ret void
10256}
10257
10258define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
10259; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10260; GFX7:       ; %bb.0: ; %entry
10261; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
10262; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10263; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
10264; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
10265; GFX7-NEXT:    s_mov_b64 s[12:13], 16
10266; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10267; GFX7-NEXT:    s_mov_b32 s6, s4
10268; GFX7-NEXT:    s_mov_b32 s7, s5
10269; GFX7-NEXT:    s_mov_b32 s11, s12
10270; GFX7-NEXT:    s_mov_b32 s10, s13
10271; GFX7-NEXT:    s_add_u32 s6, s6, s11
10272; GFX7-NEXT:    s_addc_u32 s10, s7, s10
10273; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10274; GFX7-NEXT:    s_mov_b32 s7, s10
10275; GFX7-NEXT:    v_mov_b32_e32 v2, s9
10276; GFX7-NEXT:    v_mov_b32_e32 v0, s8
10277; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10278; GFX7-NEXT:    v_mov_b32_e32 v3, v0
10279; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10280; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10281; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10282; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10283; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10284; GFX7-NEXT:    buffer_wbinvl1_vol
10285; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10286; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10287; GFX7-NEXT:    flat_store_dword v[0:1], v2
10288; GFX7-NEXT:    s_endpgm
10289;
10290; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10291; GFX10-WGP:       ; %bb.0: ; %entry
10292; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
10293; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10294; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
10295; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
10296; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
10297; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10298; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
10299; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
10300; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
10301; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
10302; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
10303; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
10304; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10305; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
10306; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
10307; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
10308; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10309; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
10310; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10311; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10312; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10313; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10314; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10315; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10316; GFX10-WGP-NEXT:    buffer_gl1_inv
10317; GFX10-WGP-NEXT:    buffer_gl0_inv
10318; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10319; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10320; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10321; GFX10-WGP-NEXT:    s_endpgm
10322;
10323; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10324; GFX10-CU:       ; %bb.0: ; %entry
10325; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
10326; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10327; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
10328; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
10329; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
10330; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10331; GFX10-CU-NEXT:    s_mov_b32 s6, s4
10332; GFX10-CU-NEXT:    s_mov_b32 s7, s5
10333; GFX10-CU-NEXT:    s_mov_b32 s11, s12
10334; GFX10-CU-NEXT:    s_mov_b32 s10, s13
10335; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
10336; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
10337; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10338; GFX10-CU-NEXT:    s_mov_b32 s7, s10
10339; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
10340; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
10341; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10342; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
10343; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10344; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10345; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10346; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10347; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10348; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10349; GFX10-CU-NEXT:    buffer_gl1_inv
10350; GFX10-CU-NEXT:    buffer_gl0_inv
10351; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10352; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10353; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10354; GFX10-CU-NEXT:    s_endpgm
10355;
10356; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10357; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10358; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
10359; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
10360; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
10361; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
10362; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
10363; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10364; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
10365; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
10366; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
10367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
10368; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
10369; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
10370; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
10371; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
10372; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
10373; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10374; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
10376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10377; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10378; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10379; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10380; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10381; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10382; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10383; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10384; SKIP-CACHE-INV-NEXT:    s_endpgm
10385;
10386; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10387; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10388; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10389; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
10390; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
10391; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10392; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
10393; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
10394; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10395; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10396; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10397; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
10398; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10399; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10400; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10401; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
10402; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
10403; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10404; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10405; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10406;
10407; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10408; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10409; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10410; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
10411; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
10412; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10413; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
10414; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
10415; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10416; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10417; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10418; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
10419; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10420; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10421; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10422; GFX90A-TGSPLIT-NEXT:    buffer_invl2
10423; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
10424; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10425; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10426; GFX90A-TGSPLIT-NEXT:    s_endpgm
10427;
10428; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10429; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10430; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10431; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
10432; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
10433; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10434; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
10435; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
10436; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10437; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10438; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10439; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
10440; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10441; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
10442; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10443; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
10444; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10445; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10446; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10447;
10448; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10449; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10450; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10451; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
10452; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
10453; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10454; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
10455; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
10456; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10457; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10458; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10459; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
10460; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10461; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
10462; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10463; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
10464; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10465; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10466; GFX940-TGSPLIT-NEXT:    s_endpgm
10467;
10468; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10469; GFX11-WGP:       ; %bb.0: ; %entry
10470; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10471; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
10472; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
10473; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10474; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
10475; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10476; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10477; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
10478; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10479; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10480; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10481; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10482; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10483; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10484; GFX11-WGP-NEXT:    buffer_gl1_inv
10485; GFX11-WGP-NEXT:    buffer_gl0_inv
10486; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10487; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10488; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10489; GFX11-WGP-NEXT:    s_endpgm
10490;
10491; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10492; GFX11-CU:       ; %bb.0: ; %entry
10493; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10494; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
10495; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
10496; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10497; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
10498; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10499; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10500; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
10501; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10502; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10503; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10504; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10505; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10506; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10507; GFX11-CU-NEXT:    buffer_gl1_inv
10508; GFX11-CU-NEXT:    buffer_gl0_inv
10509; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10510; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10511; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10512; GFX11-CU-NEXT:    s_endpgm
10513;
10514; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10515; GFX12-WGP:       ; %bb.0: ; %entry
10516; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10517; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
10518; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
10519; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10520; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
10521; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10522; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10523; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
10524; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10525; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10526; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
10527; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10528; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10529; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
10530; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10531; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10532; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10533; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
10534; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10535; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10536; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10537; GFX12-WGP-NEXT:    s_endpgm
10538;
10539; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
10540; GFX12-CU:       ; %bb.0: ; %entry
10541; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10542; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
10543; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
10544; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10545; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
10546; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10547; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10548; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
10549; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10550; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10551; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
10552; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
10553; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
10554; GFX12-CU-NEXT:    s_wait_storecnt 0x0
10555; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10556; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10557; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10558; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
10559; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10560; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10561; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10562; GFX12-CU-NEXT:    s_endpgm
10563    ptr %out, i32 %in, i32 %old) {
10564entry:
10565  %gep = getelementptr i32, ptr %out, i32 4
10566  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst
10567  %val0 = extractvalue { i32, i1 } %val, 0
10568  store i32 %val0, ptr %out, align 4
10569  ret void
10570}
10571
; Kernel under test - a volatile cmpxchg through a generic `ptr`, with success
; ordering `release` and failure ordering `seq_cst`, whose result is stored
; back through %out. The cmpxchg carries no syncscope argument.
;
; The FileCheck lines embedded in the signature below are autogenerated (see
; the note on the first line of this file). Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output shows, per run line
;   - every configuration expects wait instructions both before and after the
;     atomic, and the atomic carries glc, sc0 sc1, or
;     th:TH_ATOMIC_RETURN scope:SCOPE_SYS depending on the target;
;   - the gfx700 and gfx1010 runs build the +16 address with s_add_u32 /
;     s_addc_u32, while gfx90a, gfx940, gfx1100 and gfx1200 use `offset:16`
;     on the atomic itself;
;   - gfx90a, gfx940 and gfx1200 expect an extra instruction ahead of the
;     atomic (buffer_wbl2, buffer_wbl2 sc0 sc1, global_wb scope:SCOPE_SYS);
;   - the amdpal run with -amdgcn-skip-cache-invalidations expects no
;     buffer_wbinvl1_vol after the atomic, unlike the gfx700 amdhsa run;
;   - the +tgsplit runs expect `s_waitcnt vmcnt(0)` after the atomic where the
;     runs without tgsplit expect `s_waitcnt vmcnt(0) lgkmcnt(0)`.
10572define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
10573; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10574; GFX7:       ; %bb.0: ; %entry
10575; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
10576; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10577; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
10578; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
10579; GFX7-NEXT:    s_mov_b64 s[12:13], 16
10580; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10581; GFX7-NEXT:    s_mov_b32 s6, s4
10582; GFX7-NEXT:    s_mov_b32 s7, s5
10583; GFX7-NEXT:    s_mov_b32 s11, s12
10584; GFX7-NEXT:    s_mov_b32 s10, s13
10585; GFX7-NEXT:    s_add_u32 s6, s6, s11
10586; GFX7-NEXT:    s_addc_u32 s10, s7, s10
10587; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10588; GFX7-NEXT:    s_mov_b32 s7, s10
10589; GFX7-NEXT:    v_mov_b32_e32 v2, s9
10590; GFX7-NEXT:    v_mov_b32_e32 v0, s8
10591; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10592; GFX7-NEXT:    v_mov_b32_e32 v3, v0
10593; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10594; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10595; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10596; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10597; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10598; GFX7-NEXT:    buffer_wbinvl1_vol
10599; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10600; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10601; GFX7-NEXT:    flat_store_dword v[0:1], v2
10602; GFX7-NEXT:    s_endpgm
10603;
10604; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10605; GFX10-WGP:       ; %bb.0: ; %entry
10606; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
10607; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10608; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
10609; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
10610; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
10611; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10612; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
10613; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
10614; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
10615; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
10616; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
10617; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
10618; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10619; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
10620; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
10621; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
10622; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10623; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
10624; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10625; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10626; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10627; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10628; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10629; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10630; GFX10-WGP-NEXT:    buffer_gl1_inv
10631; GFX10-WGP-NEXT:    buffer_gl0_inv
10632; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10633; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10634; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10635; GFX10-WGP-NEXT:    s_endpgm
10636;
10637; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10638; GFX10-CU:       ; %bb.0: ; %entry
10639; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
10640; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10641; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
10642; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
10643; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
10644; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10645; GFX10-CU-NEXT:    s_mov_b32 s6, s4
10646; GFX10-CU-NEXT:    s_mov_b32 s7, s5
10647; GFX10-CU-NEXT:    s_mov_b32 s11, s12
10648; GFX10-CU-NEXT:    s_mov_b32 s10, s13
10649; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
10650; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
10651; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10652; GFX10-CU-NEXT:    s_mov_b32 s7, s10
10653; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
10654; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
10655; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10656; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
10657; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10658; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10659; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10660; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10661; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10662; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10663; GFX10-CU-NEXT:    buffer_gl1_inv
10664; GFX10-CU-NEXT:    buffer_gl0_inv
10665; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10666; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10667; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10668; GFX10-CU-NEXT:    s_endpgm
10669;
10670; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10671; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10672; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
10673; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
10674; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
10675; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
10676; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
10677; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10678; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
10679; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
10680; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
10681; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
10682; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
10683; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
10684; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
10685; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
10686; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
10687; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10688; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10689; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
10690; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10691; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10692; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10693; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10694; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10695; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10696; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10697; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10698; SKIP-CACHE-INV-NEXT:    s_endpgm
10699;
10700; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10701; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10702; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10703; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
10704; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
10705; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10706; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
10707; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
10708; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10709; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10710; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10711; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
10712; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10713; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10714; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10715; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
10716; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
10717; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10718; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10719; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10720;
10721; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10722; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10723; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10724; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
10725; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
10726; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10727; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
10728; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
10729; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10730; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10731; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10732; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
10733; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10734; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
10735; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10736; GFX90A-TGSPLIT-NEXT:    buffer_invl2
10737; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
10738; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10739; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10740; GFX90A-TGSPLIT-NEXT:    s_endpgm
10741;
10742; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10743; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10744; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10745; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
10746; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
10747; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10748; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
10749; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
10750; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10751; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10752; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10753; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
10754; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10755; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
10756; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10757; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
10758; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10759; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10760; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10761;
10762; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10763; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10764; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10765; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
10766; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
10767; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10768; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
10769; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
10770; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10771; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
10772; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10773; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
10774; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10775; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
10776; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10777; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
10778; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10779; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10780; GFX940-TGSPLIT-NEXT:    s_endpgm
10781;
10782; GFX11-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10783; GFX11-WGP:       ; %bb.0: ; %entry
10784; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10785; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
10786; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
10787; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10788; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
10789; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10790; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10791; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
10792; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10793; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10794; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10795; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10796; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10797; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10798; GFX11-WGP-NEXT:    buffer_gl1_inv
10799; GFX11-WGP-NEXT:    buffer_gl0_inv
10800; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10801; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10802; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10803; GFX11-WGP-NEXT:    s_endpgm
10804;
10805; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10806; GFX11-CU:       ; %bb.0: ; %entry
10807; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10808; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
10809; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
10810; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10811; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
10812; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10813; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10814; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
10815; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10816; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10817; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10818; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10819; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
10820; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10821; GFX11-CU-NEXT:    buffer_gl1_inv
10822; GFX11-CU-NEXT:    buffer_gl0_inv
10823; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10824; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10825; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10826; GFX11-CU-NEXT:    s_endpgm
10827;
10828; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10829; GFX12-WGP:       ; %bb.0: ; %entry
10830; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10831; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
10832; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
10833; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10834; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
10835; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10836; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10837; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
10838; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10839; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10840; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
10841; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10842; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10843; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
10844; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10845; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10846; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
10847; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
10848; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10849; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
10850; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10851; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10852; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10853; GFX12-WGP-NEXT:    s_endpgm
10854;
10855; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
10856; GFX12-CU:       ; %bb.0: ; %entry
10857; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
10858; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
10859; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
10860; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10861; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
10862; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10863; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10864; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
10865; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10866; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10867; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
10868; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
10869; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
10870; GFX12-CU-NEXT:    s_wait_storecnt 0x0
10871; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10872; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10873; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
10874; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
10875; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10876; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
10877; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10878; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10879; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10880; GFX12-CU-NEXT:    s_endpgm
10881    ptr %out, i32 %in, i32 %old) {
10882entry:
  ; Address of the i32 at index 4 from %out; the expected assembly above shows
  ; this as a displacement of 16 (either `offset:16` or an explicit add of 16).
10883  %gep = getelementptr i32, ptr %out, i32 4
  ; cmpxchg operands are (pointer, compare value, new value), so %old is the
  ; value compared against memory and %in is the replacement. Orderings are
  ; `release` on success and `seq_cst` on failure.
10884  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst
  ; Field 0 of the { i32, i1 } result is the i32 half of the pair (the value
  ; cmpxchg loaded, per the LLVM LangRef); the i1 flag is not used.
10885  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value keeps the atomic's result live; presumably this is
  ; what the "ret" in the kernel name refers to - confirm against the
  ; non-returning variants elsewhere in this file.
10886  store i32 %val0, ptr %out, align 4
10887  ret void
10888}
10889
10890define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
10891; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
10892; GFX7:       ; %bb.0: ; %entry
10893; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
10894; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10895; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
10896; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
10897; GFX7-NEXT:    s_mov_b64 s[12:13], 16
10898; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10899; GFX7-NEXT:    s_mov_b32 s6, s4
10900; GFX7-NEXT:    s_mov_b32 s7, s5
10901; GFX7-NEXT:    s_mov_b32 s11, s12
10902; GFX7-NEXT:    s_mov_b32 s10, s13
10903; GFX7-NEXT:    s_add_u32 s6, s6, s11
10904; GFX7-NEXT:    s_addc_u32 s10, s7, s10
10905; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10906; GFX7-NEXT:    s_mov_b32 s7, s10
10907; GFX7-NEXT:    v_mov_b32_e32 v2, s9
10908; GFX7-NEXT:    v_mov_b32_e32 v0, s8
10909; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10910; GFX7-NEXT:    v_mov_b32_e32 v3, v0
10911; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10912; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10913; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10914; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10915; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10916; GFX7-NEXT:    buffer_wbinvl1_vol
10917; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10918; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10919; GFX7-NEXT:    flat_store_dword v[0:1], v2
10920; GFX7-NEXT:    s_endpgm
10921;
10922; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
10923; GFX10-WGP:       ; %bb.0: ; %entry
10924; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
10925; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10926; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
10927; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
10928; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
10929; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10930; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
10931; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
10932; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
10933; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
10934; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
10935; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
10936; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10937; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
10938; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
10939; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
10940; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10941; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
10942; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10943; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10944; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10945; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
10946; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10947; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10948; GFX10-WGP-NEXT:    buffer_gl1_inv
10949; GFX10-WGP-NEXT:    buffer_gl0_inv
10950; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10951; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10952; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10953; GFX10-WGP-NEXT:    s_endpgm
10954;
10955; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
10956; GFX10-CU:       ; %bb.0: ; %entry
10957; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
10958; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
10959; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
10960; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
10961; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
10962; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10963; GFX10-CU-NEXT:    s_mov_b32 s6, s4
10964; GFX10-CU-NEXT:    s_mov_b32 s7, s5
10965; GFX10-CU-NEXT:    s_mov_b32 s11, s12
10966; GFX10-CU-NEXT:    s_mov_b32 s10, s13
10967; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
10968; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
10969; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
10970; GFX10-CU-NEXT:    s_mov_b32 s7, s10
10971; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
10972; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
10973; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
10974; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
10975; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10976; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10977; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10978; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
10979; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10980; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10981; GFX10-CU-NEXT:    buffer_gl1_inv
10982; GFX10-CU-NEXT:    buffer_gl0_inv
10983; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10984; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10985; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10986; GFX10-CU-NEXT:    s_endpgm
10987;
10988; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
10989; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10990; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
10991; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
10992; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
10993; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
10994; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
10995; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10996; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
10997; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
10998; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
10999; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
11000; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
11001; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
11002; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
11003; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
11004; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
11005; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11006; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11007; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
11008; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11009; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11010; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11011; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11012; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11013; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11014; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11015; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11016; SKIP-CACHE-INV-NEXT:    s_endpgm
11017;
11018; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11019; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11020; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11021; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
11022; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
11023; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11024; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
11025; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
11026; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11027; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11028; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11029; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
11030; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11031; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11032; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11033; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
11034; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
11035; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11036; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11037; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11038;
11039; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11040; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11041; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11042; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
11043; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
11044; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11045; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
11046; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
11047; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11048; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11049; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11050; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
11051; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11052; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11053; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11054; GFX90A-TGSPLIT-NEXT:    buffer_invl2
11055; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11056; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11057; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11058; GFX90A-TGSPLIT-NEXT:    s_endpgm
11059;
11060; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11061; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11062; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11063; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
11064; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
11065; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11066; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
11067; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
11068; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11069; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11070; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11071; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
11072; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11073; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
11074; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11075; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
11076; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11077; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11078; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11079;
11080; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11081; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11082; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11083; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
11084; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
11085; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11086; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
11087; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
11088; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11089; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11090; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11091; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
11092; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11093; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
11094; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11095; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
11096; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11097; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11098; GFX940-TGSPLIT-NEXT:    s_endpgm
11099;
11100; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11101; GFX11-WGP:       ; %bb.0: ; %entry
11102; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11103; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
11104; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
11105; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11106; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
11107; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11108; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11109; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
11110; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11111; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11112; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11113; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11114; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11115; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11116; GFX11-WGP-NEXT:    buffer_gl1_inv
11117; GFX11-WGP-NEXT:    buffer_gl0_inv
11118; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11119; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11120; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11121; GFX11-WGP-NEXT:    s_endpgm
11122;
11123; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11124; GFX11-CU:       ; %bb.0: ; %entry
11125; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11126; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
11127; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
11128; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11129; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
11130; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11131; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11132; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
11133; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11134; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11135; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11136; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
11137; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11138; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11139; GFX11-CU-NEXT:    buffer_gl1_inv
11140; GFX11-CU-NEXT:    buffer_gl0_inv
11141; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11142; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11143; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11144; GFX11-CU-NEXT:    s_endpgm
11145;
11146; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11147; GFX12-WGP:       ; %bb.0: ; %entry
11148; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11149; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
11150; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
11151; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11152; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
11153; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11154; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11155; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
11156; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11157; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11158; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
11159; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11160; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11161; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11162; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11163; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11164; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11165; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11166; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11167; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
11168; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11169; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11170; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
11171; GFX12-WGP-NEXT:    s_endpgm
11172;
11173; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
11174; GFX12-CU:       ; %bb.0: ; %entry
11175; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11176; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
11177; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
11178; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11179; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
11180; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11181; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11182; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
11183; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11184; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11185; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
11186; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
11187; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
11188; GFX12-CU-NEXT:    s_wait_storecnt 0x0
11189; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11190; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11191; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
11192; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
11193; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11194; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
11195; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11196; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11197; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11198; GFX12-CU-NEXT:    s_endpgm
11199    ptr %out, i32 %in, i32 %old) {
11200entry:
11201  %gep = getelementptr i32, ptr %out, i32 4
11202  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst
11203  %val0 = extractvalue { i32, i1 } %val, 0
11204  store i32 %val0, ptr %out, align 4
11205  ret void
11206}
11207
; Kernel under test: a volatile cmpxchg through a generic `ptr` with seq_cst
; ordering on both the success and the failure path and no explicit syncscope.
; The value returned by the cmpxchg is consumed (stored back through %out), so
; this covers the "returning" form of the atomic. The per-target assertion
; blocks below are autogenerated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them rather than editing by hand.
11208define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
11209; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11210; GFX7:       ; %bb.0: ; %entry
11211; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
11212; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
11213; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
11214; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
11215; GFX7-NEXT:    s_mov_b64 s[12:13], 16
11216; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11217; GFX7-NEXT:    s_mov_b32 s6, s4
11218; GFX7-NEXT:    s_mov_b32 s7, s5
11219; GFX7-NEXT:    s_mov_b32 s11, s12
11220; GFX7-NEXT:    s_mov_b32 s10, s13
11221; GFX7-NEXT:    s_add_u32 s6, s6, s11
11222; GFX7-NEXT:    s_addc_u32 s10, s7, s10
11223; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
11224; GFX7-NEXT:    s_mov_b32 s7, s10
11225; GFX7-NEXT:    v_mov_b32_e32 v2, s9
11226; GFX7-NEXT:    v_mov_b32_e32 v0, s8
11227; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11228; GFX7-NEXT:    v_mov_b32_e32 v3, v0
11229; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11230; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11231; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11232; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11233; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11234; GFX7-NEXT:    buffer_wbinvl1_vol
11235; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11236; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11237; GFX7-NEXT:    flat_store_dword v[0:1], v2
11238; GFX7-NEXT:    s_endpgm
11239;
11240; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11241; GFX10-WGP:       ; %bb.0: ; %entry
11242; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
11243; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
11244; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
11245; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
11246; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
11247; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11248; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
11249; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
11250; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
11251; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
11252; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
11253; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
11254; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
11255; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
11256; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
11257; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
11258; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11259; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
11260; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11261; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11262; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11263; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11264; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11265; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11266; GFX10-WGP-NEXT:    buffer_gl1_inv
11267; GFX10-WGP-NEXT:    buffer_gl0_inv
11268; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11269; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11270; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11271; GFX10-WGP-NEXT:    s_endpgm
11272;
11273; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11274; GFX10-CU:       ; %bb.0: ; %entry
11275; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
11276; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
11277; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
11278; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
11279; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
11280; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11281; GFX10-CU-NEXT:    s_mov_b32 s6, s4
11282; GFX10-CU-NEXT:    s_mov_b32 s7, s5
11283; GFX10-CU-NEXT:    s_mov_b32 s11, s12
11284; GFX10-CU-NEXT:    s_mov_b32 s10, s13
11285; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
11286; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
11287; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
11288; GFX10-CU-NEXT:    s_mov_b32 s7, s10
11289; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
11290; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
11291; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11292; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
11293; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11294; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11295; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11296; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
11297; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11298; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11299; GFX10-CU-NEXT:    buffer_gl1_inv
11300; GFX10-CU-NEXT:    buffer_gl0_inv
11301; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11302; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11303; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11304; GFX10-CU-NEXT:    s_endpgm
11305;
11306; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11307; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11308; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
11309; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
11310; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
11311; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
11312; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
11313; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11314; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
11315; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
11316; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
11317; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
11318; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
11319; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
11320; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
11321; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
11322; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
11323; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11324; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
11326; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11328; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11329; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11330; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11333; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11334; SKIP-CACHE-INV-NEXT:    s_endpgm
11335;
11336; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11337; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
11340; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
11341; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11342; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
11343; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
11344; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11345; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11346; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11347; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
11348; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11349; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11350; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11351; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
11352; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
11353; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11354; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11355; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11356;
11357; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11358; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11359; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11360; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
11361; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
11362; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11363; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
11364; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
11365; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11366; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11367; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11368; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
11369; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11370; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
11371; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11372; GFX90A-TGSPLIT-NEXT:    buffer_invl2
11373; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11374; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11375; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11376; GFX90A-TGSPLIT-NEXT:    s_endpgm
11377;
11378; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11379; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11380; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11381; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
11382; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
11383; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11384; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
11385; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
11386; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11387; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11388; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11389; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
11390; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11391; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
11392; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11393; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
11394; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11395; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11396; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11397;
11398; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11399; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11400; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11401; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
11402; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
11403; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11404; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
11405; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
11406; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11407; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
11408; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11409; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
11410; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11411; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
11412; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11413; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
11414; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11415; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11416; GFX940-TGSPLIT-NEXT:    s_endpgm
11417;
11418; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11419; GFX11-WGP:       ; %bb.0: ; %entry
11420; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11421; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
11422; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
11423; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11424; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
11425; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11426; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11427; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
11428; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11429; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11430; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11431; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
11432; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11433; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11434; GFX11-WGP-NEXT:    buffer_gl1_inv
11435; GFX11-WGP-NEXT:    buffer_gl0_inv
11436; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11437; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11438; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11439; GFX11-WGP-NEXT:    s_endpgm
11440;
11441; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11442; GFX11-CU:       ; %bb.0: ; %entry
11443; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11444; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
11445; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
11446; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11447; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
11448; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11449; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11450; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
11451; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11452; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11453; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11454; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
11455; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
11456; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11457; GFX11-CU-NEXT:    buffer_gl1_inv
11458; GFX11-CU-NEXT:    buffer_gl0_inv
11459; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11460; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11461; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11462; GFX11-CU-NEXT:    s_endpgm
11463;
11464; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11465; GFX12-WGP:       ; %bb.0: ; %entry
11466; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11467; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
11468; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
11469; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11470; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
11471; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11472; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11473; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
11474; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11475; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11476; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
11477; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11478; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11479; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
11480; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11481; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11482; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
11483; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
11484; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11485; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
11486; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11487; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11488; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
11489; GFX12-WGP-NEXT:    s_endpgm
11490;
11491; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
11492; GFX12-CU:       ; %bb.0: ; %entry
11493; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11494; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
11495; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
11496; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11497; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
11498; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11499; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
11500; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
11501; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11502; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11503; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
11504; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
11505; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
11506; GFX12-CU-NEXT:    s_wait_storecnt 0x0
11507; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11508; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11509; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
11510; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
11511; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11512; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
11513; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11514; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11515; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11516; GFX12-CU-NEXT:    s_endpgm
11517    ptr %out, i32 %in, i32 %old) {
11518entry:
  ; Address of i32 element 4 past %out, i.e. a 16-byte displacement. Targets
  ; whose expected output above carries "offset 16" on the atomic fold it into
  ; the instruction; the others materialize the constant 16 and add it with a
  ; scalar add/add-with-carry pair.
11519  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap: compare against %old, write %in on a match.
  ; Success and failure orderings are both seq_cst; no syncscope is specified.
11520  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
11521  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the loaded value so the atomic's result has a use.
11522  store i32 %val0, ptr %out, align 4
11523  ret void
11524}
11525
; Test case: an atomic `unordered` i32 load from the flat pointer %in using
; syncscope("one-as"), whose result is written to %out with a plain store.
; In every check prefix below, the expected load instruction carries no
; cache-control modifier and is not followed by any cache-invalidate
; instruction; only a wait precedes the store.
11526define amdgpu_kernel void @flat_system_one_as_unordered_load(
11527; GFX7-LABEL: flat_system_one_as_unordered_load:
11528; GFX7:       ; %bb.0: ; %entry
11529; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11530; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
11531; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11532; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11533; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11534; GFX7-NEXT:    flat_load_dword v2, v[0:1]
11535; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11536; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11537; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11538; GFX7-NEXT:    flat_store_dword v[0:1], v2
11539; GFX7-NEXT:    s_endpgm
11540;
11541; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
11542; GFX10-WGP:       ; %bb.0: ; %entry
11543; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11544; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11545; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11546; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11547; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11548; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
11549; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11550; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11551; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11552; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11553; GFX10-WGP-NEXT:    s_endpgm
11554;
11555; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
11556; GFX10-CU:       ; %bb.0: ; %entry
11557; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11558; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11559; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11560; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11561; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11562; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
11563; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11564; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11565; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11566; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11567; GFX10-CU-NEXT:    s_endpgm
11568;
11569; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load:
11570; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11571; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11572; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
11573; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11574; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11575; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11576; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
11577; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11578; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11579; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11580; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11581; SKIP-CACHE-INV-NEXT:    s_endpgm
11582;
11583; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
11584; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11585; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11586; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11587; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11588; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11589; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
11590; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11591; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11592; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11593; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11594;
11595; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
11596; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11597; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11598; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11599; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11600; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11601; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
11602; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11603; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11604; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11605; GFX90A-TGSPLIT-NEXT:    s_endpgm
11606;
11607; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
11608; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11609; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11610; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
11611; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11612; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11613; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
11614; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11615; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11616; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11617; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11618;
11619; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
11620; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11621; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11622; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
11623; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11624; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11625; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
11626; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11627; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11628; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11629; GFX940-TGSPLIT-NEXT:    s_endpgm
11630;
11631; GFX11-WGP-LABEL: flat_system_one_as_unordered_load:
11632; GFX11-WGP:       ; %bb.0: ; %entry
11633; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11634; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11635; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11636; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11637; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11638; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
11639; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11640; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11641; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11642; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11643; GFX11-WGP-NEXT:    s_endpgm
11644;
11645; GFX11-CU-LABEL: flat_system_one_as_unordered_load:
11646; GFX11-CU:       ; %bb.0: ; %entry
11647; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11648; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11649; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11650; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11651; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11652; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
11653; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11654; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11655; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11656; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11657; GFX11-CU-NEXT:    s_endpgm
11658;
11659; GFX12-WGP-LABEL: flat_system_one_as_unordered_load:
11660; GFX12-WGP:       ; %bb.0: ; %entry
11661; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11662; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11663; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11664; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11665; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11666; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
11667; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11668; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11669; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11670; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
11671; GFX12-WGP-NEXT:    s_endpgm
11672;
11673; GFX12-CU-LABEL: flat_system_one_as_unordered_load:
11674; GFX12-CU:       ; %bb.0: ; %entry
11675; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11676; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11677; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11678; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11679; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11680; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
11681; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11682; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11683; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11684; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11685; GFX12-CU-NEXT:    s_endpgm
11686    ptr %in, ptr %out) {
11687entry:
  ; Operation under test; the non-atomic store below only consumes its result.
11688  %val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4
11689  store i32 %val, ptr %out
11690  ret void
11691}
11692
; Test case: an atomic `monotonic` i32 load from the flat pointer %in using
; syncscope("one-as"), whose result is written to %out with a plain store.
; The checks below expect the load to carry a cache-control modifier that
; varies by target: `glc` (gfx700, gfx90a, gfx1100), `glc dlc` (gfx1010),
; `sc0 sc1` (gfx940) and `scope:SCOPE_SYS` (gfx1200). No cache-invalidate
; instruction is expected after the load in any configuration.
11693define amdgpu_kernel void @flat_system_one_as_monotonic_load(
11694; GFX7-LABEL: flat_system_one_as_monotonic_load:
11695; GFX7:       ; %bb.0: ; %entry
11696; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11697; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
11698; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11699; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11700; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11701; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
11702; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11703; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11704; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11705; GFX7-NEXT:    flat_store_dword v[0:1], v2
11706; GFX7-NEXT:    s_endpgm
11707;
11708; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
11709; GFX10-WGP:       ; %bb.0: ; %entry
11710; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11711; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11712; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11713; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11714; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11715; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
11716; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11717; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11718; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11719; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11720; GFX10-WGP-NEXT:    s_endpgm
11721;
11722; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
11723; GFX10-CU:       ; %bb.0: ; %entry
11724; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11725; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11726; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11727; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11728; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11729; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
11730; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11731; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11732; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11733; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11734; GFX10-CU-NEXT:    s_endpgm
11735;
11736; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load:
11737; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11738; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11739; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
11740; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11741; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11742; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11743; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
11744; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11745; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11746; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11747; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11748; SKIP-CACHE-INV-NEXT:    s_endpgm
11749;
11750; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
11751; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11752; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11753; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11754; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11755; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11756; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
11757; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11758; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11759; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11760; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11761;
11762; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
11763; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11764; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11765; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11766; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11767; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11768; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
11769; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11770; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11771; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11772; GFX90A-TGSPLIT-NEXT:    s_endpgm
11773;
11774; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
11775; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11776; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11777; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
11778; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11779; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11780; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
11781; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11782; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11783; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11784; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11785;
11786; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
11787; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11788; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11789; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
11790; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11791; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11792; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
11793; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11794; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11795; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11796; GFX940-TGSPLIT-NEXT:    s_endpgm
11797;
11798; GFX11-WGP-LABEL: flat_system_one_as_monotonic_load:
11799; GFX11-WGP:       ; %bb.0: ; %entry
11800; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11801; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11802; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11803; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11804; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11805; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
11806; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11807; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11808; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11809; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11810; GFX11-WGP-NEXT:    s_endpgm
11811;
11812; GFX11-CU-LABEL: flat_system_one_as_monotonic_load:
11813; GFX11-CU:       ; %bb.0: ; %entry
11814; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11815; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11816; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11817; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11818; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11819; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
11820; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11821; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11822; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11823; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11824; GFX11-CU-NEXT:    s_endpgm
11825;
11826; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load:
11827; GFX12-WGP:       ; %bb.0: ; %entry
11828; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11829; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11830; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11831; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11832; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11833; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
11834; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11835; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11836; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11837; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
11838; GFX12-WGP-NEXT:    s_endpgm
11839;
11840; GFX12-CU-LABEL: flat_system_one_as_monotonic_load:
11841; GFX12-CU:       ; %bb.0: ; %entry
11842; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11843; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11844; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11845; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11846; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11847; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
11848; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11849; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11850; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11851; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11852; GFX12-CU-NEXT:    s_endpgm
11853    ptr %in, ptr %out) {
11854entry:
  ; Operation under test; the non-atomic store below only consumes its result.
11855  %val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4
11856  store i32 %val, ptr %out
11857  ret void
11858}
11859
; Test case: an atomic `acquire` i32 load from the flat pointer %in using
; syncscope("one-as"), whose result is written to %out with a plain store.
; The checks below expect the load to carry the same per-target
; cache-control modifiers as the monotonic variant of this test, followed by
; a wait on the load counter and then a cache invalidate:
;   gfx700          - buffer_wbinvl1_vol
;   gfx1010/gfx1100 - buffer_gl1_inv + buffer_gl0_inv
;   gfx90a          - buffer_invl2 + buffer_wbinvl1_vol
;   gfx940          - buffer_inv sc0 sc1
;   gfx1200         - global_inv scope:SCOPE_SYS
; The run with -amdgcn-skip-cache-invalidations keeps the wait but expects no
; invalidate instruction.
11860define amdgpu_kernel void @flat_system_one_as_acquire_load(
11861; GFX7-LABEL: flat_system_one_as_acquire_load:
11862; GFX7:       ; %bb.0: ; %entry
11863; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11864; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
11865; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11866; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11867; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11868; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
11869; GFX7-NEXT:    s_waitcnt vmcnt(0)
11870; GFX7-NEXT:    buffer_wbinvl1_vol
11871; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11872; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11873; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11874; GFX7-NEXT:    flat_store_dword v[0:1], v2
11875; GFX7-NEXT:    s_endpgm
11876;
11877; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
11878; GFX10-WGP:       ; %bb.0: ; %entry
11879; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11880; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11881; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11882; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11883; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11884; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
11885; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11886; GFX10-WGP-NEXT:    buffer_gl1_inv
11887; GFX10-WGP-NEXT:    buffer_gl0_inv
11888; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11889; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11890; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11891; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11892; GFX10-WGP-NEXT:    s_endpgm
11893;
11894; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
11895; GFX10-CU:       ; %bb.0: ; %entry
11896; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11897; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11898; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11899; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11900; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11901; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
11902; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
11903; GFX10-CU-NEXT:    buffer_gl1_inv
11904; GFX10-CU-NEXT:    buffer_gl0_inv
11905; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11906; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11907; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11908; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11909; GFX10-CU-NEXT:    s_endpgm
11910;
11911; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load:
11912; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11913; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11914; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
11915; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11918; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
11919; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
11920; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11921; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11922; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11923; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11924; SKIP-CACHE-INV-NEXT:    s_endpgm
11925;
11926; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
11927; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11928; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11929; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11930; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11931; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11932; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
11933; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11934; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
11935; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
11936; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11937; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11938; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11939; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11940;
11941; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
11942; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11943; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11944; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
11945; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11946; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11947; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
11948; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11949; GFX90A-TGSPLIT-NEXT:    buffer_invl2
11950; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
11951; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11952; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11953; GFX90A-TGSPLIT-NEXT:    s_endpgm
11954;
11955; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
11956; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11957; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11958; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
11959; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11960; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11961; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
11962; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11963; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
11964; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11965; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11966; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11967; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11968;
11969; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
11970; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11971; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11972; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
11973; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11974; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11975; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
11976; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11977; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
11978; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11979; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11980; GFX940-TGSPLIT-NEXT:    s_endpgm
11981;
11982; GFX11-WGP-LABEL: flat_system_one_as_acquire_load:
11983; GFX11-WGP:       ; %bb.0: ; %entry
11984; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11985; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
11986; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11987; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11988; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11989; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
11990; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11991; GFX11-WGP-NEXT:    buffer_gl1_inv
11992; GFX11-WGP-NEXT:    buffer_gl0_inv
11993; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11994; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11995; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11996; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11997; GFX11-WGP-NEXT:    s_endpgm
11998;
11999; GFX11-CU-LABEL: flat_system_one_as_acquire_load:
12000; GFX11-CU:       ; %bb.0: ; %entry
12001; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12002; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12003; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12004; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12005; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12006; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
12007; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12008; GFX11-CU-NEXT:    buffer_gl1_inv
12009; GFX11-CU-NEXT:    buffer_gl0_inv
12010; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12011; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12012; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12013; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12014; GFX11-CU-NEXT:    s_endpgm
12015;
12016; GFX12-WGP-LABEL: flat_system_one_as_acquire_load:
12017; GFX12-WGP:       ; %bb.0: ; %entry
12018; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12019; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12020; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12021; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12022; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12023; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
12024; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12025; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
12026; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12027; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12028; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
12029; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12030; GFX12-WGP-NEXT:    s_endpgm
12031;
12032; GFX12-CU-LABEL: flat_system_one_as_acquire_load:
12033; GFX12-CU:       ; %bb.0: ; %entry
12034; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12035; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12036; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12037; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12038; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12039; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
12040; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12041; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
12042; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12043; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12044; GFX12-CU-NEXT:    s_wait_dscnt 0x0
12045; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12046; GFX12-CU-NEXT:    s_endpgm
12047    ptr %in, ptr %out) {
12048entry:
  ; Operation under test; the non-atomic store below only consumes its result.
12049  %val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4
12050  store i32 %val, ptr %out
12051  ret void
12052}
12053
12054define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
12055; GFX7-LABEL: flat_system_one_as_seq_cst_load:
12056; GFX7:       ; %bb.0: ; %entry
12057; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12058; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
12059; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12060; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12061; GFX7-NEXT:    v_mov_b32_e32 v1, s7
12062; GFX7-NEXT:    s_waitcnt vmcnt(0)
12063; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
12064; GFX7-NEXT:    s_waitcnt vmcnt(0)
12065; GFX7-NEXT:    buffer_wbinvl1_vol
12066; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12067; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12068; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12069; GFX7-NEXT:    flat_store_dword v[0:1], v2
12070; GFX7-NEXT:    s_endpgm
12071;
12072; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
12073; GFX10-WGP:       ; %bb.0: ; %entry
12074; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12075; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
12076; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12077; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12078; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12079; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12080; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12081; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
12082; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12083; GFX10-WGP-NEXT:    buffer_gl1_inv
12084; GFX10-WGP-NEXT:    buffer_gl0_inv
12085; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12086; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12087; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12088; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12089; GFX10-WGP-NEXT:    s_endpgm
12090;
12091; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
12092; GFX10-CU:       ; %bb.0: ; %entry
12093; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12094; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
12095; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12096; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12097; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12098; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
12099; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
12100; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
12101; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
12102; GFX10-CU-NEXT:    buffer_gl1_inv
12103; GFX10-CU-NEXT:    buffer_gl0_inv
12104; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12105; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12106; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12107; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12108; GFX10-CU-NEXT:    s_endpgm
12109;
12110; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load:
12111; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12112; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
12113; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
12114; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12115; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12116; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12117; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
12118; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
12119; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
12120; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12121; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12122; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12123; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12124; SKIP-CACHE-INV-NEXT:    s_endpgm
12125;
12126; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
12127; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12128; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12129; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
12130; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12131; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12132; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12133; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
12134; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12135; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
12136; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
12137; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12138; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12139; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12140; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12141;
12142; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
12143; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12144; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12145; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
12146; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12147; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12148; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12149; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
12150; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12151; GFX90A-TGSPLIT-NEXT:    buffer_invl2
12152; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
12153; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12154; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12155; GFX90A-TGSPLIT-NEXT:    s_endpgm
12156;
12157; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
12158; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12159; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
12160; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
12161; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12162; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12163; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12164; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
12165; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12166; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
12167; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12168; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12169; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12170; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12171;
12172; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
12173; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12174; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
12175; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
12176; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12177; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12178; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12179; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
12180; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12181; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
12182; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12183; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12184; GFX940-TGSPLIT-NEXT:    s_endpgm
12185;
12186; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_load:
12187; GFX11-WGP:       ; %bb.0: ; %entry
12188; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12189; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12190; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12191; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12192; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12193; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12194; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12195; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
12196; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12197; GFX11-WGP-NEXT:    buffer_gl1_inv
12198; GFX11-WGP-NEXT:    buffer_gl0_inv
12199; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12200; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12201; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12202; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12203; GFX11-WGP-NEXT:    s_endpgm
12204;
12205; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load:
12206; GFX11-CU:       ; %bb.0: ; %entry
12207; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12208; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12209; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12210; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12211; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12212; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12213; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
12214; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
12215; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12216; GFX11-CU-NEXT:    buffer_gl1_inv
12217; GFX11-CU-NEXT:    buffer_gl0_inv
12218; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12219; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12220; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12221; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12222; GFX11-CU-NEXT:    s_endpgm
12223;
12224; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load:
12225; GFX12-WGP:       ; %bb.0: ; %entry
12226; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12227; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12228; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12229; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12230; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12231; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12232; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12233; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12234; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
12235; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
12236; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12237; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12238; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12239; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
12240; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12241; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12242; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
12243; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12244; GFX12-WGP-NEXT:    s_endpgm
12245;
12246; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load:
12247; GFX12-CU:       ; %bb.0: ; %entry
12248; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12249; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
12250; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12251; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12252; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12253; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
12254; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
12255; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12256; GFX12-CU-NEXT:    s_wait_storecnt 0x0
12257; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
12258; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
12259; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
12260; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12261; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
12262; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12263; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12264; GFX12-CU-NEXT:    s_wait_dscnt 0x0
12265; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12266; GFX12-CU-NEXT:    s_endpgm
12267    ptr %in, ptr %out) {
12268entry:
12269  %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
12270  store i32 %val, ptr %out
12271  ret void
12272}
12273
; Test: unordered atomic i32 store of %in through %out at syncscope("one-as").
; The assertions below expect, for every run line, only the two kernel-argument
; loads, their wait, the register moves and a single flat store: no additional
; wait, write-back or invalidate instruction appears before or after the store.
; The gfx940 runs expect "sc0 sc1" on the store; the gfx1200 runs expect the
; store without a scope modifier.
12274define amdgpu_kernel void @flat_system_one_as_unordered_store(
12275; GFX7-LABEL: flat_system_one_as_unordered_store:
12276; GFX7:       ; %bb.0: ; %entry
12277; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
12278; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
12279; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12280; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12281; GFX7-NEXT:    v_mov_b32_e32 v1, s7
12282; GFX7-NEXT:    v_mov_b32_e32 v2, s4
12283; GFX7-NEXT:    flat_store_dword v[0:1], v2
12284; GFX7-NEXT:    s_endpgm
12285;
12286; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
12287; GFX10-WGP:       ; %bb.0: ; %entry
12288; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
12289; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12290; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12291; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12292; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12293; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
12294; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12295; GFX10-WGP-NEXT:    s_endpgm
12296;
12297; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
12298; GFX10-CU:       ; %bb.0: ; %entry
12299; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
12300; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12301; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12302; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12303; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12304; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
12305; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12306; GFX10-CU-NEXT:    s_endpgm
12307;
12308; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store:
12309; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12310; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
12311; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12312; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12313; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12314; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12315; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12316; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12317; SKIP-CACHE-INV-NEXT:    s_endpgm
12318;
12319; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
12320; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12321; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12322; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12323; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12324; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12325; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12326; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12327; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12328;
12329; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
12330; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12331; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12332; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12333; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12334; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12335; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12336; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12337; GFX90A-TGSPLIT-NEXT:    s_endpgm
12338;
12339; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
12340; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12341; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12342; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12343; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12344; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12345; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12346; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12347; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12348;
12349; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
12350; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12351; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12352; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12353; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12354; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12355; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12356; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12357; GFX940-TGSPLIT-NEXT:    s_endpgm
12358;
12359; GFX11-WGP-LABEL: flat_system_one_as_unordered_store:
12360; GFX11-WGP:       ; %bb.0: ; %entry
12361; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12362; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12363; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12364; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12365; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12366; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
12367; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12368; GFX11-WGP-NEXT:    s_endpgm
12369;
12370; GFX11-CU-LABEL: flat_system_one_as_unordered_store:
12371; GFX11-CU:       ; %bb.0: ; %entry
12372; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12373; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12374; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12375; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12376; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12377; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
12378; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12379; GFX11-CU-NEXT:    s_endpgm
12380;
12381; GFX12-WGP-LABEL: flat_system_one_as_unordered_store:
12382; GFX12-WGP:       ; %bb.0: ; %entry
12383; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12384; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12385; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12386; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12387; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12388; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
12389; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12390; GFX12-WGP-NEXT:    s_endpgm
12391;
12392; GFX12-CU-LABEL: flat_system_one_as_unordered_store:
12393; GFX12-CU:       ; %bb.0: ; %entry
12394; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12395; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12396; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12397; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12398; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12399; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
12400; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12401; GFX12-CU-NEXT:    s_endpgm
12402    i32 %in, ptr %out) {
12403entry:
  ; The only memory operation in the kernel: the atomic store under test.
12404  store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
12405  ret void
12406}
12407
; Test: monotonic atomic i32 store of %in through %out at syncscope("one-as").
; The assertions below expect, for every run line, only the two kernel-argument
; loads, their wait, the register moves and a single flat store: no additional
; wait, write-back or invalidate instruction appears around the store.
; The gfx940 runs expect "sc0 sc1" on the store and the gfx1200 runs expect
; "scope:SCOPE_SYS" on it; the remaining runs expect an unmodified store.
12408define amdgpu_kernel void @flat_system_one_as_monotonic_store(
12409; GFX7-LABEL: flat_system_one_as_monotonic_store:
12410; GFX7:       ; %bb.0: ; %entry
12411; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
12412; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
12413; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12414; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12415; GFX7-NEXT:    v_mov_b32_e32 v1, s7
12416; GFX7-NEXT:    v_mov_b32_e32 v2, s4
12417; GFX7-NEXT:    flat_store_dword v[0:1], v2
12418; GFX7-NEXT:    s_endpgm
12419;
12420; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
12421; GFX10-WGP:       ; %bb.0: ; %entry
12422; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
12423; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12424; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12425; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12426; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12427; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
12428; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12429; GFX10-WGP-NEXT:    s_endpgm
12430;
12431; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
12432; GFX10-CU:       ; %bb.0: ; %entry
12433; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
12434; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12435; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12436; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12437; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12438; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
12439; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12440; GFX10-CU-NEXT:    s_endpgm
12441;
12442; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store:
12443; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12444; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
12445; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12446; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12447; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12450; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12451; SKIP-CACHE-INV-NEXT:    s_endpgm
12452;
12453; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
12454; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12455; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12456; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12457; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12458; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12459; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12460; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12461; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12462;
12463; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
12464; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12465; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12466; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12467; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12468; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12469; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12470; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12471; GFX90A-TGSPLIT-NEXT:    s_endpgm
12472;
12473; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
12474; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12475; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12476; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12477; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12478; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12479; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12480; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12481; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12482;
12483; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
12484; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12485; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12486; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12487; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12488; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12489; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12490; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12491; GFX940-TGSPLIT-NEXT:    s_endpgm
12492;
12493; GFX11-WGP-LABEL: flat_system_one_as_monotonic_store:
12494; GFX11-WGP:       ; %bb.0: ; %entry
12495; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12496; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12497; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12498; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12499; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12500; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
12501; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12502; GFX11-WGP-NEXT:    s_endpgm
12503;
12504; GFX11-CU-LABEL: flat_system_one_as_monotonic_store:
12505; GFX11-CU:       ; %bb.0: ; %entry
12506; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12507; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12508; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12509; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12510; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12511; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
12512; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12513; GFX11-CU-NEXT:    s_endpgm
12514;
12515; GFX12-WGP-LABEL: flat_system_one_as_monotonic_store:
12516; GFX12-WGP:       ; %bb.0: ; %entry
12517; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12518; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12519; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12520; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12521; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12522; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
12523; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
12524; GFX12-WGP-NEXT:    s_endpgm
12525;
12526; GFX12-CU-LABEL: flat_system_one_as_monotonic_store:
12527; GFX12-CU:       ; %bb.0: ; %entry
12528; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12529; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12530; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12531; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12532; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12533; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
12534; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
12535; GFX12-CU-NEXT:    s_endpgm
12536    i32 %in, ptr %out) {
12537entry:
  ; The only memory operation in the kernel: the atomic store under test.
12538  store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
12539  ret void
12540}
12541
; Test: release atomic i32 store of %in through %out at syncscope("one-as").
; Compared with a bare flat store, the assertions below expect extra
; instructions between the register moves and the store:
;   - gfx700 runs (amdhsa and amdpal): "s_waitcnt vmcnt(0)".
;   - gfx1010 and gfx1100 runs: "s_waitcnt vmcnt(0)" followed by
;     "s_waitcnt_vscnt null, 0x0".
;   - gfx90a runs: "buffer_wbl2" followed by "s_waitcnt vmcnt(0)".
;   - gfx940 runs: "buffer_wbl2 sc0 sc1" followed by "s_waitcnt vmcnt(0)",
;     with "sc0 sc1" on the store itself.
;   - gfx1200 runs: "global_wb scope:SCOPE_SYS" followed by waits on bvhcnt,
;     samplecnt, loadcnt and storecnt, with "scope:SCOPE_SYS" on the store.
; Nothing is expected after the store other than s_endpgm.
12542define amdgpu_kernel void @flat_system_one_as_release_store(
12543; GFX7-LABEL: flat_system_one_as_release_store:
12544; GFX7:       ; %bb.0: ; %entry
12545; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
12546; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
12547; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12548; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12549; GFX7-NEXT:    v_mov_b32_e32 v1, s7
12550; GFX7-NEXT:    v_mov_b32_e32 v2, s4
12551; GFX7-NEXT:    s_waitcnt vmcnt(0)
12552; GFX7-NEXT:    flat_store_dword v[0:1], v2
12553; GFX7-NEXT:    s_endpgm
12554;
12555; GFX10-WGP-LABEL: flat_system_one_as_release_store:
12556; GFX10-WGP:       ; %bb.0: ; %entry
12557; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
12558; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12559; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12560; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12561; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12562; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
12563; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12564; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12565; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12566; GFX10-WGP-NEXT:    s_endpgm
12567;
12568; GFX10-CU-LABEL: flat_system_one_as_release_store:
12569; GFX10-CU:       ; %bb.0: ; %entry
12570; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
12571; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12572; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12573; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12574; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12575; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
12576; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
12577; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
12578; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12579; GFX10-CU-NEXT:    s_endpgm
12580;
12581; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store:
12582; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12583; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
12584; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12585; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12586; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12587; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12588; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12589; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
12590; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12591; SKIP-CACHE-INV-NEXT:    s_endpgm
12592;
12593; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
12594; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12595; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12596; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12597; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12598; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12599; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12600; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
12601; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12602; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12603; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12604;
12605; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
12606; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12607; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12608; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12609; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12610; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12611; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12612; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
12613; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12614; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12615; GFX90A-TGSPLIT-NEXT:    s_endpgm
12616;
12617; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
12618; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12619; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12620; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12621; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12622; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12623; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12624; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
12625; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12626; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12627; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12628;
12629; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_store:
12630; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12631; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12632; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12633; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12634; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12635; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12636; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
12637; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12638; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12639; GFX940-TGSPLIT-NEXT:    s_endpgm
12640;
12641; GFX11-WGP-LABEL: flat_system_one_as_release_store:
12642; GFX11-WGP:       ; %bb.0: ; %entry
12643; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12644; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12645; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12646; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12647; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12648; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
12649; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12650; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12651; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12652; GFX11-WGP-NEXT:    s_endpgm
12653;
12654; GFX11-CU-LABEL: flat_system_one_as_release_store:
12655; GFX11-CU:       ; %bb.0: ; %entry
12656; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12657; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12658; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12659; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12660; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12661; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
12662; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12663; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
12664; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12665; GFX11-CU-NEXT:    s_endpgm
12666;
12667; GFX12-WGP-LABEL: flat_system_one_as_release_store:
12668; GFX12-WGP:       ; %bb.0: ; %entry
12669; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12670; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12671; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12672; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12673; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12674; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
12675; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
12676; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12677; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12678; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12679; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
12680; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
12681; GFX12-WGP-NEXT:    s_endpgm
12682;
12683; GFX12-CU-LABEL: flat_system_one_as_release_store:
12684; GFX12-CU:       ; %bb.0: ; %entry
12685; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12686; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12687; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12688; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12689; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12690; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
12691; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
12692; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
12693; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
12694; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12695; GFX12-CU-NEXT:    s_wait_storecnt 0x0
12696; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
12697; GFX12-CU-NEXT:    s_endpgm
12698    i32 %in, ptr %out) {
12699entry:
  ; The only memory operation in the kernel: the atomic store under test.
12700  store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
12701  ret void
12702}
12703
; Test: seq_cst atomic i32 store of %in through %out at syncscope("one-as").
; Compared with a bare flat store, the assertions below expect extra
; instructions between the register moves and the store:
;   - gfx700 runs (amdhsa and amdpal): "s_waitcnt vmcnt(0)".
;   - gfx1010 and gfx1100 runs: "s_waitcnt vmcnt(0)" followed by
;     "s_waitcnt_vscnt null, 0x0".
;   - gfx90a runs: "buffer_wbl2" followed by "s_waitcnt vmcnt(0)".
;   - gfx940 runs: "buffer_wbl2 sc0 sc1" followed by "s_waitcnt vmcnt(0)",
;     with "sc0 sc1" on the store itself.
;   - gfx1200 runs: "global_wb scope:SCOPE_SYS" followed by waits on bvhcnt,
;     samplecnt, loadcnt and storecnt, with "scope:SCOPE_SYS" on the store.
; Nothing is expected after the store other than s_endpgm.
12704define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
12705; GFX7-LABEL: flat_system_one_as_seq_cst_store:
12706; GFX7:       ; %bb.0: ; %entry
12707; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
12708; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
12709; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12710; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12711; GFX7-NEXT:    v_mov_b32_e32 v1, s7
12712; GFX7-NEXT:    v_mov_b32_e32 v2, s4
12713; GFX7-NEXT:    s_waitcnt vmcnt(0)
12714; GFX7-NEXT:    flat_store_dword v[0:1], v2
12715; GFX7-NEXT:    s_endpgm
12716;
12717; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
12718; GFX10-WGP:       ; %bb.0: ; %entry
12719; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
12720; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12721; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12722; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12723; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12724; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
12725; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12726; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12727; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12728; GFX10-WGP-NEXT:    s_endpgm
12729;
12730; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
12731; GFX10-CU:       ; %bb.0: ; %entry
12732; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
12733; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12734; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12735; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12736; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12737; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
12738; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
12739; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
12740; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12741; GFX10-CU-NEXT:    s_endpgm
12742;
12743; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store:
12744; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12745; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
12746; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
12747; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12748; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12749; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12750; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12751; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
12752; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12753; SKIP-CACHE-INV-NEXT:    s_endpgm
12754;
12755; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
12756; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12757; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12758; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12759; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12760; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12761; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12762; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
12763; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12764; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12765; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12766;
12767; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
12768; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12769; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
12770; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
12771; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12772; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12773; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12774; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
12775; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12776; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12777; GFX90A-TGSPLIT-NEXT:    s_endpgm
12778;
12779; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
12780; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12781; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12782; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12783; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12784; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12785; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12786; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
12787; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12788; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12789; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12790;
12791; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
12792; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12793; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
12794; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
12795; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12796; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12797; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12798; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
12799; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12800; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12801; GFX940-TGSPLIT-NEXT:    s_endpgm
12802;
12803; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_store:
12804; GFX11-WGP:       ; %bb.0: ; %entry
12805; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12806; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12807; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12808; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12809; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12810; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
12811; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12812; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
12813; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12814; GFX11-WGP-NEXT:    s_endpgm
12815;
12816; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store:
12817; GFX11-CU:       ; %bb.0: ; %entry
12818; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12819; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12820; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12821; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12822; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12823; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
12824; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12825; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
12826; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12827; GFX11-CU-NEXT:    s_endpgm
12828;
12829; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_store:
12830; GFX12-WGP:       ; %bb.0: ; %entry
12831; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
12832; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12833; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12834; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12835; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12836; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
12837; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
12838; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
12839; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
12840; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12841; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
12842; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
12843; GFX12-WGP-NEXT:    s_endpgm
12844;
12845; GFX12-CU-LABEL: flat_system_one_as_seq_cst_store:
12846; GFX12-CU:       ; %bb.0: ; %entry
12847; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
12848; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
12849; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12850; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12851; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12852; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
12853; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
12854; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
12855; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
12856; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12857; GFX12-CU-NEXT:    s_wait_storecnt 0x0
12858; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
12859; GFX12-CU-NEXT:    s_endpgm
12860    i32 %in, ptr %out) {
12861entry:
  ; The only memory operation in the kernel: the atomic store under test.
12862  store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
12863  ret void
12864}
12865
; Test: volatile atomicrmw xchg through a flat pointer with
; syncscope("one-as") and monotonic ordering.
; For every RUN configuration the recorded output is only the scalar loads,
; the register moves and a single flat atomic swap; no wait or cache
; maintenance instruction is expected around the swap. The gfx940 checks
; carry an sc1 modifier on the swap and the gfx1200 checks carry
; scope:SCOPE_SYS.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
12866define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
12867; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
12868; GFX7:       ; %bb.0: ; %entry
12869; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12870; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
12871; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12872; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12873; GFX7-NEXT:    v_mov_b32_e32 v1, s7
12874; GFX7-NEXT:    v_mov_b32_e32 v2, s4
12875; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
12876; GFX7-NEXT:    s_endpgm
12877;
12878; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
12879; GFX10-WGP:       ; %bb.0: ; %entry
12880; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12881; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
12882; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12883; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12884; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12885; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
12886; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
12887; GFX10-WGP-NEXT:    s_endpgm
12888;
12889; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
12890; GFX10-CU:       ; %bb.0: ; %entry
12891; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12892; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
12893; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12894; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12895; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12896; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
12897; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
12898; GFX10-CU-NEXT:    s_endpgm
12899;
12900; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw:
12901; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12902; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
12903; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
12904; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12905; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12906; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
12907; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
12908; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
12909; SKIP-CACHE-INV-NEXT:    s_endpgm
12910;
12911; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
12912; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12913; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12914; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
12915; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12916; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12917; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12918; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
12919; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12920;
12921; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
12922; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12923; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
12924; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
12925; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12926; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
12927; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
12928; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
12929; GFX90A-TGSPLIT-NEXT:    s_endpgm
12930;
12931; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
12932; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12933; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
12934; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
12935; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12936; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12937; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12938; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
12939; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12940;
12941; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
12942; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12943; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
12944; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
12945; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12946; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
12947; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
12948; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
12949; GFX940-TGSPLIT-NEXT:    s_endpgm
12950;
12951; GFX11-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
12952; GFX11-WGP:       ; %bb.0: ; %entry
12953; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12954; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
12955; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12956; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12957; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12958; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
12959; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
12960; GFX11-WGP-NEXT:    s_endpgm
12961;
12962; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
12963; GFX11-CU:       ; %bb.0: ; %entry
12964; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12965; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
12966; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12967; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12968; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12969; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
12970; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
12971; GFX11-CU-NEXT:    s_endpgm
12972;
12973; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
12974; GFX12-WGP:       ; %bb.0: ; %entry
12975; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12976; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
12977; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12978; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12979; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12980; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
12981; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
12982; GFX12-WGP-NEXT:    s_endpgm
12983;
12984; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
12985; GFX12-CU:       ; %bb.0: ; %entry
12986; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
12987; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
12988; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12989; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12990; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12991; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
12992; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
12993; GFX12-CU-NEXT:    s_endpgm
12994    ptr %out, i32 %in) {
12995entry:
  ; %val is never read afterwards; the checks above only cover the emitted
  ; swap instruction and whatever surrounds it.
12996  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
12997  ret void
12998}
12999
; Test: volatile atomicrmw xchg through a flat pointer with
; syncscope("one-as") and acquire ordering.
; Recorded shape: the flat atomic swap is followed by a wait
; (s_waitcnt vmcnt(0), s_waitcnt_vscnt null, 0x0 or s_wait_storecnt 0x0,
; depending on the target) and then by that target's cache invalidate
; instruction(s). The run that passes -amdgcn-skip-cache-invalidations keeps
; the wait but has no invalidate before s_endpgm.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
13000define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
13001; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
13002; GFX7:       ; %bb.0: ; %entry
13003; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13004; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
13005; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13006; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13007; GFX7-NEXT:    v_mov_b32_e32 v1, s7
13008; GFX7-NEXT:    v_mov_b32_e32 v2, s4
13009; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
13010; GFX7-NEXT:    s_waitcnt vmcnt(0)
13011; GFX7-NEXT:    buffer_wbinvl1_vol
13012; GFX7-NEXT:    s_endpgm
13013;
13014; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
13015; GFX10-WGP:       ; %bb.0: ; %entry
13016; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13017; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
13018; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13019; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13020; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13021; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
13022; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
13023; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13024; GFX10-WGP-NEXT:    buffer_gl1_inv
13025; GFX10-WGP-NEXT:    buffer_gl0_inv
13026; GFX10-WGP-NEXT:    s_endpgm
13027;
13028; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
13029; GFX10-CU:       ; %bb.0: ; %entry
13030; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13031; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
13032; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13033; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13034; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13035; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
13036; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
13037; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13038; GFX10-CU-NEXT:    buffer_gl1_inv
13039; GFX10-CU-NEXT:    buffer_gl0_inv
13040; GFX10-CU-NEXT:    s_endpgm
13041;
13042; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw:
13043; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13044; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13045; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
13046; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13047; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13048; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
13049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
13050; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
13051; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13052; SKIP-CACHE-INV-NEXT:    s_endpgm
13053;
13054; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
13055; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13056; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13057; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13058; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13059; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13060; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13061; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13062; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13063; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
13064; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
13065; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13066;
13067; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
13068; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13069; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13070; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13071; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13072; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13073; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13074; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13075; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13076; GFX90A-TGSPLIT-NEXT:    buffer_invl2
13077; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13078; GFX90A-TGSPLIT-NEXT:    s_endpgm
13079;
13080; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
13081; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13082; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13083; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13084; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13085; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13086; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13087; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13088; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13089; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
13090; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13091;
13092; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
13093; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13094; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13095; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13096; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13097; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13098; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13099; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13100; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13101; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
13102; GFX940-TGSPLIT-NEXT:    s_endpgm
13103;
13104; GFX11-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
13105; GFX11-WGP:       ; %bb.0: ; %entry
13106; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13107; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13108; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13109; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13110; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13111; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
13112; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13113; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13114; GFX11-WGP-NEXT:    buffer_gl1_inv
13115; GFX11-WGP-NEXT:    buffer_gl0_inv
13116; GFX11-WGP-NEXT:    s_endpgm
13117;
13118; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
13119; GFX11-CU:       ; %bb.0: ; %entry
13120; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13121; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13122; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13123; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13124; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13125; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
13126; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13127; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13128; GFX11-CU-NEXT:    buffer_gl1_inv
13129; GFX11-CU-NEXT:    buffer_gl0_inv
13130; GFX11-CU-NEXT:    s_endpgm
13131;
13132; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
13133; GFX12-WGP:       ; %bb.0: ; %entry
13134; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13135; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13136; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13137; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13138; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13139; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
13140; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13141; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13142; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
13143; GFX12-WGP-NEXT:    s_endpgm
13144;
13145; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
13146; GFX12-CU:       ; %bb.0: ; %entry
13147; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13148; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13149; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13150; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13151; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13152; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
13153; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13154; GFX12-CU-NEXT:    s_wait_storecnt 0x0
13155; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
13156; GFX12-CU-NEXT:    s_endpgm
13157    ptr %out, i32 %in) {
13158entry:
  ; %val is never read afterwards; the checks above only cover the emitted
  ; swap instruction and what follows it.
13159  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
13160  ret void
13161}
13162
; Test: volatile atomicrmw xchg through a flat pointer with
; syncscope("one-as") and release ordering.
; Recorded shape: wait instruction(s) are placed immediately before the flat
; atomic swap. The gfx90a and gfx940 checks additionally expect buffer_wbl2
; ahead of the wait, and the gfx1200 checks expect global_wb scope:SCOPE_SYS
; ahead of their waits. Nothing is expected between the swap and s_endpgm.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
13163define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
13164; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
13165; GFX7:       ; %bb.0: ; %entry
13166; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13167; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
13168; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13169; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13170; GFX7-NEXT:    v_mov_b32_e32 v1, s7
13171; GFX7-NEXT:    v_mov_b32_e32 v2, s4
13172; GFX7-NEXT:    s_waitcnt vmcnt(0)
13173; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
13174; GFX7-NEXT:    s_endpgm
13175;
13176; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
13177; GFX10-WGP:       ; %bb.0: ; %entry
13178; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13179; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
13180; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13181; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13182; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13183; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
13184; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13185; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13186; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
13187; GFX10-WGP-NEXT:    s_endpgm
13188;
13189; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
13190; GFX10-CU:       ; %bb.0: ; %entry
13191; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13192; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
13193; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13194; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13195; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13196; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
13197; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
13198; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13199; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
13200; GFX10-CU-NEXT:    s_endpgm
13201;
13202; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw:
13203; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13204; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13205; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
13206; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13207; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
13209; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
13210; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13211; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
13212; SKIP-CACHE-INV-NEXT:    s_endpgm
13213;
13214; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
13215; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13216; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13217; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13218; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13219; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13220; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13221; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
13222; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13223; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13224; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13225;
13226; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
13227; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13228; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13229; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13230; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13231; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13232; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13233; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
13234; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13235; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13236; GFX90A-TGSPLIT-NEXT:    s_endpgm
13237;
13238; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
13239; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13240; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13241; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13242; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13243; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13244; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13245; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
13246; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13247; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13248; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13249;
13250; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
13251; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13252; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13253; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13254; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13255; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13256; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13257; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
13258; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13259; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13260; GFX940-TGSPLIT-NEXT:    s_endpgm
13261;
13262; GFX11-WGP-LABEL: flat_system_one_as_release_atomicrmw:
13263; GFX11-WGP:       ; %bb.0: ; %entry
13264; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13265; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13266; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13267; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13268; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13269; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
13270; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13271; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13272; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13273; GFX11-WGP-NEXT:    s_endpgm
13274;
13275; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw:
13276; GFX11-CU:       ; %bb.0: ; %entry
13277; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13278; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13279; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13280; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13281; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13282; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
13283; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
13284; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13285; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13286; GFX11-CU-NEXT:    s_endpgm
13287;
13288; GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw:
13289; GFX12-WGP:       ; %bb.0: ; %entry
13290; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13291; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13292; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13293; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13294; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13295; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
13296; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
13297; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
13298; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
13299; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13300; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13301; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13302; GFX12-WGP-NEXT:    s_endpgm
13303;
13304; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw:
13305; GFX12-CU:       ; %bb.0: ; %entry
13306; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13307; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13308; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13309; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13310; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13311; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
13312; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
13313; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
13314; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
13315; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
13316; GFX12-CU-NEXT:    s_wait_storecnt 0x0
13317; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13318; GFX12-CU-NEXT:    s_endpgm
13319    ptr %out, i32 %in) {
13320entry:
  ; %val is never read afterwards; the checks above only cover the emitted
  ; swap instruction and what precedes it.
13321  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
13322  ret void
13323}
13324
; Test: volatile atomicrmw xchg through a flat pointer with
; syncscope("one-as") and acq_rel ordering.
; Recorded shape: wait instruction(s) before the flat atomic swap (with
; buffer_wbl2 first on gfx90a/gfx940 and global_wb scope:SCOPE_SYS first on
; gfx1200), then a second wait after the swap followed by the target's cache
; invalidate instruction(s). The run that passes
; -amdgcn-skip-cache-invalidations keeps both waits but has no invalidate
; before s_endpgm.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
13325define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
13326; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13327; GFX7:       ; %bb.0: ; %entry
13328; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13329; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
13330; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13331; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13332; GFX7-NEXT:    v_mov_b32_e32 v1, s7
13333; GFX7-NEXT:    v_mov_b32_e32 v2, s4
13334; GFX7-NEXT:    s_waitcnt vmcnt(0)
13335; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
13336; GFX7-NEXT:    s_waitcnt vmcnt(0)
13337; GFX7-NEXT:    buffer_wbinvl1_vol
13338; GFX7-NEXT:    s_endpgm
13339;
13340; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13341; GFX10-WGP:       ; %bb.0: ; %entry
13342; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13343; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
13344; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13345; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13346; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13347; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
13348; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13349; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13350; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
13351; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13352; GFX10-WGP-NEXT:    buffer_gl1_inv
13353; GFX10-WGP-NEXT:    buffer_gl0_inv
13354; GFX10-WGP-NEXT:    s_endpgm
13355;
13356; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13357; GFX10-CU:       ; %bb.0: ; %entry
13358; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13359; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
13360; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13361; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13362; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13363; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
13364; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
13365; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13366; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
13367; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13368; GFX10-CU-NEXT:    buffer_gl1_inv
13369; GFX10-CU-NEXT:    buffer_gl0_inv
13370; GFX10-CU-NEXT:    s_endpgm
13371;
13372; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13373; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13374; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13375; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
13376; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13377; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13378; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
13379; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
13380; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13381; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
13382; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13383; SKIP-CACHE-INV-NEXT:    s_endpgm
13384;
13385; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13386; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13387; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13388; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13389; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13390; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13391; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13392; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
13393; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13394; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13395; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13396; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
13397; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
13398; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13399;
13400; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13401; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13402; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13403; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13404; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13405; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13406; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13407; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
13408; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13409; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13410; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13411; GFX90A-TGSPLIT-NEXT:    buffer_invl2
13412; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13413; GFX90A-TGSPLIT-NEXT:    s_endpgm
13414;
13415; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13416; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13417; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13418; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13419; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13420; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13421; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13422; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
13423; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13424; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13425; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13426; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
13427; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13428;
13429; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13430; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13431; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13432; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13433; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13434; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13435; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13436; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
13437; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13438; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13439; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13440; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
13441; GFX940-TGSPLIT-NEXT:    s_endpgm
13442;
13443; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13444; GFX11-WGP:       ; %bb.0: ; %entry
13445; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13446; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13447; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13448; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13449; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13450; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
13451; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13452; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13453; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13454; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13455; GFX11-WGP-NEXT:    buffer_gl1_inv
13456; GFX11-WGP-NEXT:    buffer_gl0_inv
13457; GFX11-WGP-NEXT:    s_endpgm
13458;
13459; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13460; GFX11-CU:       ; %bb.0: ; %entry
13461; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13462; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13463; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13464; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13465; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13466; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
13467; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
13468; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13469; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13470; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13471; GFX11-CU-NEXT:    buffer_gl1_inv
13472; GFX11-CU-NEXT:    buffer_gl0_inv
13473; GFX11-CU-NEXT:    s_endpgm
13474;
13475; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13476; GFX12-WGP:       ; %bb.0: ; %entry
13477; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13478; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13479; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13480; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13481; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13482; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
13483; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
13484; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
13485; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
13486; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13487; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13488; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13489; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13490; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
13491; GFX12-WGP-NEXT:    s_endpgm
13492;
13493; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
13494; GFX12-CU:       ; %bb.0: ; %entry
13495; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13496; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13497; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13498; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13499; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13500; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
13501; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
13502; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
13503; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
13504; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
13505; GFX12-CU-NEXT:    s_wait_storecnt 0x0
13506; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13507; GFX12-CU-NEXT:    s_wait_storecnt 0x0
13508; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
13509; GFX12-CU-NEXT:    s_endpgm
13510    ptr %out, i32 %in) {
13511entry:
  ; %val is never read afterwards; the checks above only cover the emitted
  ; swap instruction and what surrounds it.
13512  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
13513  ret void
13514}
13515
13516define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
13517; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13518; GFX7:       ; %bb.0: ; %entry
13519; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13520; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
13521; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13522; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13523; GFX7-NEXT:    v_mov_b32_e32 v1, s7
13524; GFX7-NEXT:    v_mov_b32_e32 v2, s4
13525; GFX7-NEXT:    s_waitcnt vmcnt(0)
13526; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
13527; GFX7-NEXT:    s_waitcnt vmcnt(0)
13528; GFX7-NEXT:    buffer_wbinvl1_vol
13529; GFX7-NEXT:    s_endpgm
13530;
13531; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13532; GFX10-WGP:       ; %bb.0: ; %entry
13533; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13534; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
13535; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13536; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13537; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13538; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
13539; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13540; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13541; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
13542; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13543; GFX10-WGP-NEXT:    buffer_gl1_inv
13544; GFX10-WGP-NEXT:    buffer_gl0_inv
13545; GFX10-WGP-NEXT:    s_endpgm
13546;
13547; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13548; GFX10-CU:       ; %bb.0: ; %entry
13549; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13550; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
13551; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13552; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13553; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13554; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
13555; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
13556; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13557; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
13558; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13559; GFX10-CU-NEXT:    buffer_gl1_inv
13560; GFX10-CU-NEXT:    buffer_gl0_inv
13561; GFX10-CU-NEXT:    s_endpgm
13562;
13563; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13564; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13565; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13566; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
13567; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13568; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13569; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
13570; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
13571; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13572; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
13573; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13574; SKIP-CACHE-INV-NEXT:    s_endpgm
13575;
13576; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13577; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13578; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13579; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13580; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13581; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13582; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13583; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
13584; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13585; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13586; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13587; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
13588; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
13589; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13590;
13591; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13592; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13593; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
13594; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
13595; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13596; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
13597; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
13598; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
13599; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13600; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
13601; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13602; GFX90A-TGSPLIT-NEXT:    buffer_invl2
13603; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13604; GFX90A-TGSPLIT-NEXT:    s_endpgm
13605;
13606; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13607; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13608; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13609; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13610; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13611; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13612; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13613; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
13614; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13615; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13616; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13617; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
13618; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13619;
13620; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13621; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13622; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
13623; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
13624; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13625; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
13626; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
13627; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
13628; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13629; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
13630; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13631; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
13632; GFX940-TGSPLIT-NEXT:    s_endpgm
13633;
13634; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13635; GFX11-WGP:       ; %bb.0: ; %entry
13636; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13637; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13638; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13639; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13640; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13641; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
13642; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13643; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13644; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13645; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13646; GFX11-WGP-NEXT:    buffer_gl1_inv
13647; GFX11-WGP-NEXT:    buffer_gl0_inv
13648; GFX11-WGP-NEXT:    s_endpgm
13649;
13650; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13651; GFX11-CU:       ; %bb.0: ; %entry
13652; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13653; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13654; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13655; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13656; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13657; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
13658; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
13659; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13660; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
13661; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13662; GFX11-CU-NEXT:    buffer_gl1_inv
13663; GFX11-CU-NEXT:    buffer_gl0_inv
13664; GFX11-CU-NEXT:    s_endpgm
13665;
13666; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13667; GFX12-WGP:       ; %bb.0: ; %entry
13668; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13669; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
13670; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13671; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13672; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13673; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
13674; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
13675; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
13676; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
13677; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13678; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13679; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13680; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
13681; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
13682; GFX12-WGP-NEXT:    s_endpgm
13683;
13684; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
13685; GFX12-CU:       ; %bb.0: ; %entry
13686; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
13687; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
13688; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13689; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13690; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13691; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
13692; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
13693; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
13694; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
13695; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
13696; GFX12-CU-NEXT:    s_wait_storecnt 0x0
13697; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
13698; GFX12-CU-NEXT:    s_wait_storecnt 0x0
13699; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
13700; GFX12-CU-NEXT:    s_endpgm
13701    ptr %out, i32 %in) {
13702entry:
13703  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
13704  ret void
13705}
13706
; Returning atomic exchange on a flat pointer with acquire ordering and
; syncscope("one-as"); the value produced by the atomicrmw is stored back
; through %out.
; In the expectations below the swap is issued directly after its operands
; are moved into VGPRs, and is followed by a vmcnt/loadcnt wait and then the
; target's cache invalidate. The skip-cache-invalidations run keeps only the
; wait. The autogenerated assertion lines must not be edited by hand.
13707define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
13708; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13709; GFX7:       ; %bb.0: ; %entry
13710; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13711; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
13712; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13713; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13714; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13715; GFX7-NEXT:    v_mov_b32_e32 v2, s6
13716; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13717; GFX7-NEXT:    s_waitcnt vmcnt(0)
13718; GFX7-NEXT:    buffer_wbinvl1_vol
13719; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13720; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13721; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13722; GFX7-NEXT:    flat_store_dword v[0:1], v2
13723; GFX7-NEXT:    s_endpgm
13724;
13725; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13726; GFX10-WGP:       ; %bb.0: ; %entry
13727; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13728; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
13729; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13730; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13731; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13732; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
13733; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13734; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13735; GFX10-WGP-NEXT:    buffer_gl1_inv
13736; GFX10-WGP-NEXT:    buffer_gl0_inv
13737; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13738; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13739; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13740; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
13741; GFX10-WGP-NEXT:    s_endpgm
13742;
13743; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13744; GFX10-CU:       ; %bb.0: ; %entry
13745; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13746; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
13747; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13748; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13749; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13750; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
13751; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13752; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
13753; GFX10-CU-NEXT:    buffer_gl1_inv
13754; GFX10-CU-NEXT:    buffer_gl0_inv
13755; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13756; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13757; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13758; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
13759; GFX10-CU-NEXT:    s_endpgm
13760;
13761; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13762; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13763; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13764; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
13765; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13767; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13768; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
13769; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13770; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13771; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13772; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13773; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13774; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
13775; SKIP-CACHE-INV-NEXT:    s_endpgm
13776;
13777; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13778; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13779; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13780; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
13781; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13782; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13783; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
13784; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13785; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13786; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
13787; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
13788; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13789; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13790; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
13791; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13792;
13793; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13794; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13795; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13796; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
13797; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13798; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13799; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
13800; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13801; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13802; GFX90A-TGSPLIT-NEXT:    buffer_invl2
13803; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
13804; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13805; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
13806; GFX90A-TGSPLIT-NEXT:    s_endpgm
13807;
13808; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13809; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13810; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13811; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
13812; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13813; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13814; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
13815; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
13816; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13817; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
13818; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13819; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13820; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
13821; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13822;
13823; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13824; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13825; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13826; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
13827; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13828; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13829; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
13830; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
13831; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13832; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
13833; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13834; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
13835; GFX940-TGSPLIT-NEXT:    s_endpgm
13836;
13837; GFX11-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13838; GFX11-WGP:       ; %bb.0: ; %entry
13839; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13840; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
13841; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13842; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13843; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13844; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
13845; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
13846; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
13847; GFX11-WGP-NEXT:    buffer_gl1_inv
13848; GFX11-WGP-NEXT:    buffer_gl0_inv
13849; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13850; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13851; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13852; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
13853; GFX11-WGP-NEXT:    s_endpgm
13854;
13855; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13856; GFX11-CU:       ; %bb.0: ; %entry
13857; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13858; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
13859; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13860; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13861; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13862; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
13863; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
13864; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
13865; GFX11-CU-NEXT:    buffer_gl1_inv
13866; GFX11-CU-NEXT:    buffer_gl0_inv
13867; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13868; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13869; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13870; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
13871; GFX11-CU-NEXT:    s_endpgm
13872;
13873; GFX12-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13874; GFX12-WGP:       ; %bb.0: ; %entry
13875; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13876; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
13877; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13878; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13879; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13880; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
13881; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13882; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
13883; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
13884; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13885; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13886; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
13887; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
13888; GFX12-WGP-NEXT:    s_endpgm
13889;
13890; GFX12-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
13891; GFX12-CU:       ; %bb.0: ; %entry
13892; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13893; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
13894; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13895; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13896; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13897; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
13898; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13899; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
13900; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
13901; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13902; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13903; GFX12-CU-NEXT:    s_wait_dscnt 0x0
13904; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
13905; GFX12-CU-NEXT:    s_endpgm
13906    ptr %out, i32 %in) {
13907entry:
13908  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
; NOTE(review): the exchanged-out value is written back through %out,
; presumably so the result is live and the returning form of the swap is
; selected - confirm.
13909  store i32 %val, ptr %out, align 4
13910  ret void
13911}
13912
; Returning atomic exchange on a flat pointer with acq_rel ordering and
; syncscope("one-as"); the value produced by the atomicrmw is stored back
; through %out.
; In the expectations below the swap is preceded by a wait on outstanding
; memory operations (with an additional buffer_wbl2 / global_wb write-back
; on the gfx90a, gfx940 and gfx12 runs) and is followed by a wait and then
; the target's cache invalidate. The skip-cache-invalidations run keeps both
; waits but has no invalidate. The autogenerated assertion lines must not be
; edited by hand.
13913define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
13914; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
13915; GFX7:       ; %bb.0: ; %entry
13916; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13917; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
13918; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13919; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13920; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13921; GFX7-NEXT:    v_mov_b32_e32 v2, s6
13922; GFX7-NEXT:    s_waitcnt vmcnt(0)
13923; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13924; GFX7-NEXT:    s_waitcnt vmcnt(0)
13925; GFX7-NEXT:    buffer_wbinvl1_vol
13926; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13927; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13928; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13929; GFX7-NEXT:    flat_store_dword v[0:1], v2
13930; GFX7-NEXT:    s_endpgm
13931;
13932; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
13933; GFX10-WGP:       ; %bb.0: ; %entry
13934; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13935; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
13936; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13937; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13938; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13939; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
13940; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13941; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
13942; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13943; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
13944; GFX10-WGP-NEXT:    buffer_gl1_inv
13945; GFX10-WGP-NEXT:    buffer_gl0_inv
13946; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13947; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13948; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13949; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
13950; GFX10-WGP-NEXT:    s_endpgm
13951;
13952; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
13953; GFX10-CU:       ; %bb.0: ; %entry
13954; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13955; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
13956; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13957; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13958; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13959; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
13960; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
13961; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
13962; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13963; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
13964; GFX10-CU-NEXT:    buffer_gl1_inv
13965; GFX10-CU-NEXT:    buffer_gl0_inv
13966; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13967; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13968; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13969; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
13970; GFX10-CU-NEXT:    s_endpgm
13971;
13972; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
13973; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13974; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13975; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
13976; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13977; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13978; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13979; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
13980; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13981; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13982; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
13983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13985; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13986; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
13987; SKIP-CACHE-INV-NEXT:    s_endpgm
13988;
13989; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
13990; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13991; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13992; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
13993; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13994; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13995; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
13996; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
13997; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
13998; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
13999; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14000; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
14001; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
14002; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14003; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14004; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
14005; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14006;
14007; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14008; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14009; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14010; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
14011; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14012; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14013; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
14014; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
14015; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14016; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14017; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14018; GFX90A-TGSPLIT-NEXT:    buffer_invl2
14019; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14020; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14021; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
14022; GFX90A-TGSPLIT-NEXT:    s_endpgm
14023;
14024; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14025; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14026; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14027; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
14028; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14029; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14030; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
14031; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
14032; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14033; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
14034; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14035; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
14036; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14037; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14038; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
14039; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14040;
14041; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14042; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14043; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14044; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
14045; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14046; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14047; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
14048; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
14049; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14050; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
14051; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14052; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
14053; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14054; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
14055; GFX940-TGSPLIT-NEXT:    s_endpgm
14056;
14057; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14058; GFX11-WGP:       ; %bb.0: ; %entry
14059; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14060; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
14061; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14062; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14063; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14064; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
14065; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14066; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14067; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
14068; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14069; GFX11-WGP-NEXT:    buffer_gl1_inv
14070; GFX11-WGP-NEXT:    buffer_gl0_inv
14071; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14072; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14073; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14074; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
14075; GFX11-WGP-NEXT:    s_endpgm
14076;
14077; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14078; GFX11-CU:       ; %bb.0: ; %entry
14079; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14080; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
14081; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14082; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14083; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14084; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
14085; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
14086; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
14087; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
14088; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
14089; GFX11-CU-NEXT:    buffer_gl1_inv
14090; GFX11-CU-NEXT:    buffer_gl0_inv
14091; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14092; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14093; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14094; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
14095; GFX11-CU-NEXT:    s_endpgm
14096;
14097; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14098; GFX12-WGP:       ; %bb.0: ; %entry
14099; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14100; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
14101; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14102; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14103; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14104; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
14105; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
14106; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14107; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14108; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14109; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14110; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14111; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14112; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14113; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14114; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
14115; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14116; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14117; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
14118; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
14119; GFX12-WGP-NEXT:    s_endpgm
14120;
14121; GFX12-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
14122; GFX12-CU:       ; %bb.0: ; %entry
14123; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14124; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
14125; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14126; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14127; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14128; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
14129; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
14130; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
14131; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
14132; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
14133; GFX12-CU-NEXT:    s_wait_storecnt 0x0
14134; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14135; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
14136; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
14137; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
14138; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
14139; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14140; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14141; GFX12-CU-NEXT:    s_wait_dscnt 0x0
14142; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
14143; GFX12-CU-NEXT:    s_endpgm
14144    ptr %out, i32 %in) {
14145entry:
14146  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
; NOTE(review): the exchanged-out value is written back through %out,
; presumably so the result is live and the returning form of the swap is
; selected - confirm.
14147  store i32 %val, ptr %out, align 4
14148  ret void
14149}
14150
14151define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
14152; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14153; GFX7:       ; %bb.0: ; %entry
14154; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14155; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
14156; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14157; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14158; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14159; GFX7-NEXT:    v_mov_b32_e32 v2, s6
14160; GFX7-NEXT:    s_waitcnt vmcnt(0)
14161; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14162; GFX7-NEXT:    s_waitcnt vmcnt(0)
14163; GFX7-NEXT:    buffer_wbinvl1_vol
14164; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14165; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14166; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14167; GFX7-NEXT:    flat_store_dword v[0:1], v2
14168; GFX7-NEXT:    s_endpgm
14169;
14170; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14171; GFX10-WGP:       ; %bb.0: ; %entry
14172; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14173; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
14174; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14175; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14176; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14177; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
14178; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14179; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14180; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14181; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14182; GFX10-WGP-NEXT:    buffer_gl1_inv
14183; GFX10-WGP-NEXT:    buffer_gl0_inv
14184; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14185; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14186; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14187; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
14188; GFX10-WGP-NEXT:    s_endpgm
14189;
14190; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14191; GFX10-CU:       ; %bb.0: ; %entry
14192; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14193; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
14194; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14195; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14196; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14197; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
14198; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
14199; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
14200; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14201; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
14202; GFX10-CU-NEXT:    buffer_gl1_inv
14203; GFX10-CU-NEXT:    buffer_gl0_inv
14204; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14205; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14206; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14207; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
14208; GFX10-CU-NEXT:    s_endpgm
14209;
14210; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14211; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14212; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14213; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
14214; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14216; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14217; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
14218; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
14219; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14220; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
14221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14223; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14224; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
14225; SKIP-CACHE-INV-NEXT:    s_endpgm
14226;
14227; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14228; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14229; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14230; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
14231; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14232; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14233; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
14234; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
14235; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14236; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14237; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14238; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
14239; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
14240; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14241; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14242; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
14243; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14244;
14245; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14246; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14247; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14248; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
14249; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14250; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14251; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
14252; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
14253; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14254; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
14255; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14256; GFX90A-TGSPLIT-NEXT:    buffer_invl2
14257; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14259; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
14260; GFX90A-TGSPLIT-NEXT:    s_endpgm
14261;
14262; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14263; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14264; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14265; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
14266; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14267; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14268; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
14269; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
14270; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14271; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
14272; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14273; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
14274; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14275; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14276; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
14277; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14278;
14279; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14280; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14281; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14282; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
14283; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14284; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14285; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
14286; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
14287; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14288; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
14289; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14290; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
14291; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14292; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
14293; GFX940-TGSPLIT-NEXT:    s_endpgm
14294;
14295; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14296; GFX11-WGP:       ; %bb.0: ; %entry
14297; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14298; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
14299; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14300; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14301; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14302; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
14303; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14304; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14305; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
14306; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
14307; GFX11-WGP-NEXT:    buffer_gl1_inv
14308; GFX11-WGP-NEXT:    buffer_gl0_inv
14309; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14310; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14311; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14312; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
14313; GFX11-WGP-NEXT:    s_endpgm
14314;
14315; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14316; GFX11-CU:       ; %bb.0: ; %entry
14317; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14318; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
14319; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14320; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14321; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14322; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
14323; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
14324; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
14325; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
14326; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
14327; GFX11-CU-NEXT:    buffer_gl1_inv
14328; GFX11-CU-NEXT:    buffer_gl0_inv
14329; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14330; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14331; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14332; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
14333; GFX11-CU-NEXT:    s_endpgm
14334;
14335; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14336; GFX12-WGP:       ; %bb.0: ; %entry
14337; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14338; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
14339; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14340; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14341; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14342; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
14343; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
14344; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14345; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14346; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14347; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14348; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14349; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
14350; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
14351; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
14352; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
14353; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14354; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14355; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
14356; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
14357; GFX12-WGP-NEXT:    s_endpgm
14358;
14359; GFX12-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
14360; GFX12-CU:       ; %bb.0: ; %entry
14361; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14362; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
14363; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14364; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14365; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14366; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
14367; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
14368; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
14369; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
14370; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
14371; GFX12-CU-NEXT:    s_wait_storecnt 0x0
14372; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14373; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
14374; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
14375; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
14376; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
14377; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14378; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14379; GFX12-CU-NEXT:    s_wait_dscnt 0x0
14380; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
14381; GFX12-CU-NEXT:    s_endpgm
14382    ptr %out, i32 %in) {
14383entry:
14384  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
14385  store i32 %val, ptr %out, align 4
14386  ret void
14387}
14388
; Kernel under test: a volatile cmpxchg on the address %out + 16 bytes with
; syncscope("one-as") and monotonic success and failure orderings. The result
; (%val) is never used.
;
; For every run line the expected code is a single flat_atomic_cmpswap with no
; destination register, preceded only by register moves and followed directly
; by s_endpgm, i.e. no wait or cache-maintenance instruction is expected next
; to the atomic.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
14389define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
14390; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14391; GFX7:       ; %bb.0: ; %entry
14392; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14393; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14394; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14395; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14396; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14397; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14398; GFX7-NEXT:    s_mov_b32 s4, s8
14399; GFX7-NEXT:    s_mov_b32 s5, s9
14400; GFX7-NEXT:    s_mov_b32 s9, s10
14401; GFX7-NEXT:    s_mov_b32 s8, s11
14402; GFX7-NEXT:    s_add_u32 s4, s4, s9
14403; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14404; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14405; GFX7-NEXT:    s_mov_b32 s5, s8
14406; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14407; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14408; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14409; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14410; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14411; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14412; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14413; GFX7-NEXT:    s_endpgm
14414;
14415; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14416; GFX10-WGP:       ; %bb.0: ; %entry
14417; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14418; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14419; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14420; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14421; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14422; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14423; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14424; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14425; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14426; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14427; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14428; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14429; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14430; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14431; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14432; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14433; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14434; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14435; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14436; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14437; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14438; GFX10-WGP-NEXT:    s_endpgm
14439;
14440; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14441; GFX10-CU:       ; %bb.0: ; %entry
14442; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14443; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14444; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14445; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14446; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14447; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14448; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14449; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14450; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14451; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14452; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14453; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14454; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14455; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14456; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14457; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14458; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14459; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14460; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14461; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14462; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14463; GFX10-CU-NEXT:    s_endpgm
14464;
14465; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14466; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14467; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14468; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14469; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14470; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14471; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14472; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14473; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14474; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14475; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14476; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14477; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14478; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14479; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14480; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14481; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14482; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14483; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14484; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14487; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14488; SKIP-CACHE-INV-NEXT:    s_endpgm
14489;
14490; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14491; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14492; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14493; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14494; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14495; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14496; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14497; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14498; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14499; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14500; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14501; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14502; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14503;
14504; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14505; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14506; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14507; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14508; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14509; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14510; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14511; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14512; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14513; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14514; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14515; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14516; GFX90A-TGSPLIT-NEXT:    s_endpgm
14517;
14518; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14519; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14520; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14521; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14522; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14523; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14524; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14525; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14526; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14527; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14528; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14529; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
14530; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14531;
14532; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14533; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14534; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14535; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14536; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14537; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14538; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14539; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14540; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14541; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14542; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14543; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
14544; GFX940-TGSPLIT-NEXT:    s_endpgm
14545;
14546; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14547; GFX11-WGP:       ; %bb.0: ; %entry
14548; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14549; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14550; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14551; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14552; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14553; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14554; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14555; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14556; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14557; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14558; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14559; GFX11-WGP-NEXT:    s_endpgm
14560;
14561; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14562; GFX11-CU:       ; %bb.0: ; %entry
14563; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14564; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14565; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14566; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14567; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14568; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14569; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14570; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14571; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14572; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14573; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14574; GFX11-CU-NEXT:    s_endpgm
14575;
14576; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14577; GFX12-WGP:       ; %bb.0: ; %entry
14578; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14579; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14580; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14581; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14582; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14583; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14584; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14585; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14586; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14587; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14588; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
14589; GFX12-WGP-NEXT:    s_endpgm
14590;
14591; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
14592; GFX12-CU:       ; %bb.0: ; %entry
14593; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14594; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14595; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14596; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14597; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14598; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14599; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14600; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14601; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14602; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14603; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
14604; GFX12-CU-NEXT:    s_endpgm
14605    ptr %out, i32 %in, i32 %old) {
14606entry:
; Index 4 of an i32 array, i.e. byte offset 16 from %out. The expected output
; either adds 16 to the pointer with scalar instructions or encodes it as
; "offset:16" on the atomic, depending on the target.
14607  %gep = getelementptr i32, ptr %out, i32 4
14608  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
14609  ret void
14610}
14611
; Kernel under test: a volatile cmpxchg on the address %out + 16 bytes with
; syncscope("one-as"), acquire success ordering and monotonic failure
; ordering. The result (%val) is never used.
;
; For every run line the expected code is a flat_atomic_cmpswap with no
; destination register, followed by a wait instruction (s_waitcnt vmcnt(0),
; s_waitcnt_vscnt null, 0x0 or s_wait_storecnt 0x0, depending on the target)
; and then the target's cache-invalidate instruction(s). The run that passes
; -amdgcn-skip-cache-invalidations keeps the wait but expects no invalidate.
; No wait or cache-maintenance instruction is expected before the atomic.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
14612define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
14613; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14614; GFX7:       ; %bb.0: ; %entry
14615; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14616; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14617; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14618; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14619; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14620; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14621; GFX7-NEXT:    s_mov_b32 s4, s8
14622; GFX7-NEXT:    s_mov_b32 s5, s9
14623; GFX7-NEXT:    s_mov_b32 s9, s10
14624; GFX7-NEXT:    s_mov_b32 s8, s11
14625; GFX7-NEXT:    s_add_u32 s4, s4, s9
14626; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14627; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14628; GFX7-NEXT:    s_mov_b32 s5, s8
14629; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14630; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14631; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14632; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14633; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14634; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14635; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14636; GFX7-NEXT:    s_waitcnt vmcnt(0)
14637; GFX7-NEXT:    buffer_wbinvl1_vol
14638; GFX7-NEXT:    s_endpgm
14639;
14640; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14641; GFX10-WGP:       ; %bb.0: ; %entry
14642; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14643; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14644; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14645; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14646; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14647; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14648; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14649; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14650; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14651; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14652; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14653; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14654; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14655; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14656; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14657; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14658; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14659; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14660; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14661; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14662; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14663; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14664; GFX10-WGP-NEXT:    buffer_gl1_inv
14665; GFX10-WGP-NEXT:    buffer_gl0_inv
14666; GFX10-WGP-NEXT:    s_endpgm
14667;
14668; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14669; GFX10-CU:       ; %bb.0: ; %entry
14670; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14671; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14672; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14673; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14674; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14675; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14676; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14677; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14678; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14679; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14680; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14681; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14682; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14683; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14684; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14685; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14686; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14687; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14688; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14689; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14690; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14691; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
14692; GFX10-CU-NEXT:    buffer_gl1_inv
14693; GFX10-CU-NEXT:    buffer_gl0_inv
14694; GFX10-CU-NEXT:    s_endpgm
14695;
14696; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14697; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14698; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14699; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14700; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14701; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14702; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14703; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14704; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14705; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14706; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14707; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14708; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14709; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14710; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14711; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14712; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14713; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14714; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14715; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14716; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14717; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14718; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14719; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
14720; SKIP-CACHE-INV-NEXT:    s_endpgm
14721;
14722; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14723; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14724; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14725; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14726; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14727; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14728; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14729; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14730; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14731; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14732; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14733; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14734; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14735; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
14736; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
14737; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14738;
14739; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14740; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14741; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14742; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14743; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14744; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14745; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14746; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14747; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14748; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14749; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14750; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14751; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14752; GFX90A-TGSPLIT-NEXT:    buffer_invl2
14753; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
14754; GFX90A-TGSPLIT-NEXT:    s_endpgm
14755;
14756; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14757; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14758; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14759; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14760; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14761; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14762; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14763; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14764; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14765; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14766; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14767; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
14768; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14769; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
14770; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14771;
14772; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14773; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14774; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14775; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14776; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14777; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14778; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14779; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14780; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14781; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14782; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14783; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
14784; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14785; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
14786; GFX940-TGSPLIT-NEXT:    s_endpgm
14787;
14788; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14789; GFX11-WGP:       ; %bb.0: ; %entry
14790; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14791; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14792; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14793; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14794; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14795; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14796; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14797; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14798; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14799; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14800; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14801; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14802; GFX11-WGP-NEXT:    buffer_gl1_inv
14803; GFX11-WGP-NEXT:    buffer_gl0_inv
14804; GFX11-WGP-NEXT:    s_endpgm
14805;
14806; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14807; GFX11-CU:       ; %bb.0: ; %entry
14808; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14809; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14810; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14811; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14812; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14813; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14814; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14815; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14816; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14817; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14818; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14819; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
14820; GFX11-CU-NEXT:    buffer_gl1_inv
14821; GFX11-CU-NEXT:    buffer_gl0_inv
14822; GFX11-CU-NEXT:    s_endpgm
14823;
14824; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14825; GFX12-WGP:       ; %bb.0: ; %entry
14826; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14827; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14828; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14829; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14830; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14831; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14832; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14833; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14834; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14835; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14836; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
14837; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
14838; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
14839; GFX12-WGP-NEXT:    s_endpgm
14840;
14841; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
14842; GFX12-CU:       ; %bb.0: ; %entry
14843; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14844; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14845; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14846; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14847; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14848; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14849; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14850; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14851; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14852; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14853; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
14854; GFX12-CU-NEXT:    s_wait_storecnt 0x0
14855; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
14856; GFX12-CU-NEXT:    s_endpgm
14857    ptr %out, i32 %in, i32 %old) {
14858entry:
; Index 4 of an i32 array, i.e. byte offset 16 from %out. The expected output
; either adds 16 to the pointer with scalar instructions or encodes it as
; "offset:16" on the atomic, depending on the target.
14859  %gep = getelementptr i32, ptr %out, i32 4
14860  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
14861  ret void
14862}
14863
; Flat cmpxchg test with syncscope("one-as"), success ordering release and
; failure ordering monotonic. In the autogenerated checks below, every prefix
; expects its wait instructions (plus buffer_wbl2 or global_wb on the prefixes
; that list one) directly before the atomic, and s_endpgm directly after it.
; The check lines are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file) and should be regenerated rather than edited by hand.
14864define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
14865; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
14866; GFX7:       ; %bb.0: ; %entry
14867; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14868; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14869; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14870; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14871; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14872; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14873; GFX7-NEXT:    s_mov_b32 s4, s8
14874; GFX7-NEXT:    s_mov_b32 s5, s9
14875; GFX7-NEXT:    s_mov_b32 s9, s10
14876; GFX7-NEXT:    s_mov_b32 s8, s11
14877; GFX7-NEXT:    s_add_u32 s4, s4, s9
14878; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14879; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14880; GFX7-NEXT:    s_mov_b32 s5, s8
14881; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14882; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14883; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14884; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14885; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14886; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14887; GFX7-NEXT:    s_waitcnt vmcnt(0)
14888; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14889; GFX7-NEXT:    s_endpgm
14890;
14891; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
14892; GFX10-WGP:       ; %bb.0: ; %entry
14893; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14894; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14895; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14896; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14897; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14898; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14899; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14900; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14901; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14902; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14903; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14904; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14905; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14906; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14907; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14908; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14909; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14910; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14911; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14912; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14913; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
14914; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
14915; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14916; GFX10-WGP-NEXT:    s_endpgm
14917;
14918; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
14919; GFX10-CU:       ; %bb.0: ; %entry
14920; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14921; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14922; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14923; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14924; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14925; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14926; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14927; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14928; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14929; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14930; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14931; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14932; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14933; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14934; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14935; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14936; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14937; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14938; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14939; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14940; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
14941; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
14942; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14943; GFX10-CU-NEXT:    s_endpgm
14944;
14945; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
14946; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14947; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14948; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14949; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14950; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14951; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14952; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14953; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14954; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14955; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14956; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14957; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14958; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14959; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14960; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14961; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14962; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14963; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14967; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
14968; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14969; SKIP-CACHE-INV-NEXT:    s_endpgm
14970;
14971; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
14972; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14973; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14974; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14975; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14976; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14977; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14978; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14979; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14980; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14981; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14982; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
14983; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
14984; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14985; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14986;
14987; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
14988; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14989; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14990; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14991; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14992; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14993; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14994; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14995; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14996; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14997; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14998; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
14999; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15000; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15001; GFX90A-TGSPLIT-NEXT:    s_endpgm
15002;
15003; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
15004; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15005; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15006; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15007; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15008; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15009; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15010; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15011; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15012; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15013; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15014; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
15015; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15016; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15017; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15018;
15019; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
15020; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15021; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15022; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15023; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15024; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15025; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15026; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15027; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15028; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15029; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15030; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
15031; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15032; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15033; GFX940-TGSPLIT-NEXT:    s_endpgm
15034;
15035; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
15036; GFX11-WGP:       ; %bb.0: ; %entry
15037; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15038; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15039; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15040; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15041; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15042; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15043; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15044; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15045; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15046; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15047; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15048; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15049; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15050; GFX11-WGP-NEXT:    s_endpgm
15051;
15052; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
15053; GFX11-CU:       ; %bb.0: ; %entry
15054; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15055; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15056; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15057; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15058; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15059; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15060; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15061; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15062; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15063; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15064; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
15065; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15066; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15067; GFX11-CU-NEXT:    s_endpgm
15068;
15069; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
15070; GFX12-WGP:       ; %bb.0: ; %entry
15071; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15072; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15073; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15074; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15075; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15076; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15077; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15078; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15079; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15080; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15081; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
15082; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15083; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15084; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15085; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15086; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15087; GFX12-WGP-NEXT:    s_endpgm
15088;
15089; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
15090; GFX12-CU:       ; %bb.0: ; %entry
15091; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15092; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15093; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15094; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15095; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15096; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15097; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15098; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15099; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15100; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15101; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
15102; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
15103; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
15104; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
15105; GFX12-CU-NEXT:    s_wait_storecnt 0x0
15106; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15107; GFX12-CU-NEXT:    s_endpgm
15108    ptr %out, i32 %in, i32 %old) {
15109entry:
  ; %gep is %out advanced by 4 i32 elements. The checks above show this either
  ; as a 16-byte offset on the atomic or as an explicit 64-bit scalar add of 16.
15110  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match. The result %val is never
  ; used and the kernel returns void.
15111  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
15112  ret void
15113}
15114
; Flat cmpxchg test with syncscope("one-as"), success ordering acq_rel and
; failure ordering monotonic. In the autogenerated checks below, every prefix
; expects wait instructions (plus buffer_wbl2 or global_wb on the prefixes that
; list one) directly before the atomic and a further wait directly after it.
; All prefixes except SKIP-CACHE-INV then expect the buffer_wbinvl1_vol,
; buffer_gl1_inv and buffer_gl0_inv, buffer_invl2, buffer_inv or global_inv
; instructions listed for that prefix before s_endpgm. The SKIP-CACHE-INV run
; passes -amdgcn-skip-cache-invalidations to llc (see the RUN lines) and expects
; s_endpgm right after the trailing wait.
; The check lines are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file) and should be regenerated rather than edited by hand.
15115define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
15116; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15117; GFX7:       ; %bb.0: ; %entry
15118; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15119; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15120; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15121; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15122; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15123; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15124; GFX7-NEXT:    s_mov_b32 s4, s8
15125; GFX7-NEXT:    s_mov_b32 s5, s9
15126; GFX7-NEXT:    s_mov_b32 s9, s10
15127; GFX7-NEXT:    s_mov_b32 s8, s11
15128; GFX7-NEXT:    s_add_u32 s4, s4, s9
15129; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15130; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15131; GFX7-NEXT:    s_mov_b32 s5, s8
15132; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15133; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15134; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15135; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15136; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15137; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15138; GFX7-NEXT:    s_waitcnt vmcnt(0)
15139; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15140; GFX7-NEXT:    s_waitcnt vmcnt(0)
15141; GFX7-NEXT:    buffer_wbinvl1_vol
15142; GFX7-NEXT:    s_endpgm
15143;
15144; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15145; GFX10-WGP:       ; %bb.0: ; %entry
15146; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15147; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15148; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15149; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15150; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15151; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15152; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15153; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15154; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15155; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15156; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15157; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15158; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15159; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15160; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15161; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15162; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15163; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15164; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15165; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15166; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15167; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15168; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15169; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15170; GFX10-WGP-NEXT:    buffer_gl1_inv
15171; GFX10-WGP-NEXT:    buffer_gl0_inv
15172; GFX10-WGP-NEXT:    s_endpgm
15173;
15174; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15175; GFX10-CU:       ; %bb.0: ; %entry
15176; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15177; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15178; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15179; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15180; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15181; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15182; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15183; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15184; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15185; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15186; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15187; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15188; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15189; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15190; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15191; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15192; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15193; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15194; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15195; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15196; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
15197; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15198; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15199; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15200; GFX10-CU-NEXT:    buffer_gl1_inv
15201; GFX10-CU-NEXT:    buffer_gl0_inv
15202; GFX10-CU-NEXT:    s_endpgm
15203;
15204; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15205; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15206; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15207; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15208; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15209; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15210; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15211; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15212; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15213; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15214; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15215; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15216; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15217; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15218; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15220; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15222; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15223; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15225; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15226; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15227; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15228; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15229; SKIP-CACHE-INV-NEXT:    s_endpgm
15230;
15231; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15232; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15233; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15234; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15235; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15236; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15237; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15238; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15239; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15240; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15241; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15242; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
15243; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15244; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15245; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15246; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
15247; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
15248; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15249;
15250; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15251; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15252; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15253; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15254; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15255; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15256; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15257; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15258; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15259; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15260; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15261; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
15262; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15263; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15264; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15265; GFX90A-TGSPLIT-NEXT:    buffer_invl2
15266; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15267; GFX90A-TGSPLIT-NEXT:    s_endpgm
15268;
15269; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15270; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15271; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15272; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15273; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15274; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15275; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15276; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15277; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15278; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15279; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15280; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
15281; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15282; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15283; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15284; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
15285; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15286;
15287; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15288; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15289; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15290; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15291; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15292; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15293; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15294; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15295; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15296; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15297; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15298; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
15299; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15300; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15301; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15302; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
15303; GFX940-TGSPLIT-NEXT:    s_endpgm
15304;
15305; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15306; GFX11-WGP:       ; %bb.0: ; %entry
15307; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15308; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15309; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15310; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15311; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15312; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15313; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15314; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15315; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15316; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15317; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15318; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15319; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15320; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15321; GFX11-WGP-NEXT:    buffer_gl1_inv
15322; GFX11-WGP-NEXT:    buffer_gl0_inv
15323; GFX11-WGP-NEXT:    s_endpgm
15324;
15325; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15326; GFX11-CU:       ; %bb.0: ; %entry
15327; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15328; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15329; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15330; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15331; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15332; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15333; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15334; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15335; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15336; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15337; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
15338; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15339; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15340; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15341; GFX11-CU-NEXT:    buffer_gl1_inv
15342; GFX11-CU-NEXT:    buffer_gl0_inv
15343; GFX11-CU-NEXT:    s_endpgm
15344;
15345; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15346; GFX12-WGP:       ; %bb.0: ; %entry
15347; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15348; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15349; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15350; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15351; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15352; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15353; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15354; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15355; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15356; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15357; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
15358; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15359; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15360; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15361; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15362; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15363; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15364; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
15365; GFX12-WGP-NEXT:    s_endpgm
15366;
15367; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
15368; GFX12-CU:       ; %bb.0: ; %entry
15369; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15370; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15371; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15372; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15373; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15374; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15375; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15376; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15377; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15378; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15379; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
15380; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
15381; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
15382; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
15383; GFX12-CU-NEXT:    s_wait_storecnt 0x0
15384; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15385; GFX12-CU-NEXT:    s_wait_storecnt 0x0
15386; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
15387; GFX12-CU-NEXT:    s_endpgm
15388    ptr %out, i32 %in, i32 %old) {
15389entry:
  ; %gep is %out advanced by 4 i32 elements. The checks above show this either
  ; as a 16-byte offset on the atomic or as an explicit 64-bit scalar add of 16.
15390  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match. The result %val is never
  ; used and the kernel returns void.
15391  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
15392  ret void
15393}
15394
15395define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
15396; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15397; GFX7:       ; %bb.0: ; %entry
15398; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15399; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15400; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15401; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15402; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15403; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15404; GFX7-NEXT:    s_mov_b32 s4, s8
15405; GFX7-NEXT:    s_mov_b32 s5, s9
15406; GFX7-NEXT:    s_mov_b32 s9, s10
15407; GFX7-NEXT:    s_mov_b32 s8, s11
15408; GFX7-NEXT:    s_add_u32 s4, s4, s9
15409; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15410; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15411; GFX7-NEXT:    s_mov_b32 s5, s8
15412; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15413; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15414; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15415; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15416; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15417; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15418; GFX7-NEXT:    s_waitcnt vmcnt(0)
15419; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15420; GFX7-NEXT:    s_waitcnt vmcnt(0)
15421; GFX7-NEXT:    buffer_wbinvl1_vol
15422; GFX7-NEXT:    s_endpgm
15423;
15424; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15425; GFX10-WGP:       ; %bb.0: ; %entry
15426; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15427; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15428; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15429; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15430; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15431; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15432; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15433; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15434; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15435; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15436; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15437; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15438; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15439; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15440; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15441; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15442; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15443; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15444; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15445; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15446; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15447; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15448; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15449; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15450; GFX10-WGP-NEXT:    buffer_gl1_inv
15451; GFX10-WGP-NEXT:    buffer_gl0_inv
15452; GFX10-WGP-NEXT:    s_endpgm
15453;
15454; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15455; GFX10-CU:       ; %bb.0: ; %entry
15456; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15457; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15458; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15459; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15460; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15461; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15462; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15463; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15464; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15465; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15466; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15467; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15468; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15469; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15470; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15471; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15472; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15473; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15474; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15475; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15476; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
15477; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15478; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15479; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15480; GFX10-CU-NEXT:    buffer_gl1_inv
15481; GFX10-CU-NEXT:    buffer_gl0_inv
15482; GFX10-CU-NEXT:    s_endpgm
15483;
15484; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15485; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15486; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15487; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15488; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15489; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15490; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15491; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15492; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15493; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15494; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15495; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15496; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15497; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15498; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15499; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15500; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15501; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15502; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15503; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15504; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15505; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15506; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15507; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15508; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15509; SKIP-CACHE-INV-NEXT:    s_endpgm
15510;
15511; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15512; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15513; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15514; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15515; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15516; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15517; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15518; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15519; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15520; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15521; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15522; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
15523; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15524; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15525; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15526; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
15527; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
15528; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15529;
15530; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15531; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15532; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15533; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15534; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15535; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15536; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15537; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15538; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15539; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15540; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15541; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
15542; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15543; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15544; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15545; GFX90A-TGSPLIT-NEXT:    buffer_invl2
15546; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15547; GFX90A-TGSPLIT-NEXT:    s_endpgm
15548;
15549; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15550; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15551; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15552; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15553; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15554; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15555; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15556; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15557; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15558; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15559; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15560; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
15561; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15562; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15563; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15564; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
15565; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15566;
15567; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15568; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15569; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15570; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15571; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15572; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15573; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15574; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15575; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15576; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15577; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15578; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
15579; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15580; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15581; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15582; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
15583; GFX940-TGSPLIT-NEXT:    s_endpgm
15584;
15585; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15586; GFX11-WGP:       ; %bb.0: ; %entry
15587; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15588; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15589; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15590; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15591; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15592; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15593; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15594; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15595; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15596; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15597; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15598; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15599; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15600; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15601; GFX11-WGP-NEXT:    buffer_gl1_inv
15602; GFX11-WGP-NEXT:    buffer_gl0_inv
15603; GFX11-WGP-NEXT:    s_endpgm
15604;
15605; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15606; GFX11-CU:       ; %bb.0: ; %entry
15607; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15608; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15609; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15610; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15611; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15612; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15613; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15614; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15615; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15616; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15617; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
15618; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15619; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15620; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15621; GFX11-CU-NEXT:    buffer_gl1_inv
15622; GFX11-CU-NEXT:    buffer_gl0_inv
15623; GFX11-CU-NEXT:    s_endpgm
15624;
15625; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15626; GFX12-WGP:       ; %bb.0: ; %entry
15627; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15628; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15629; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15630; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15631; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15632; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15633; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15634; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15635; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15636; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15637; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
15638; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
15639; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
15640; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15641; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15642; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15643; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15644; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
15645; GFX12-WGP-NEXT:    s_endpgm
15646;
15647; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
15648; GFX12-CU:       ; %bb.0: ; %entry
15649; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15650; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15651; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15652; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15653; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15654; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15655; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15656; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15657; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15658; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15659; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
15660; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
15661; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
15662; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
15663; GFX12-CU-NEXT:    s_wait_storecnt 0x0
15664; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15665; GFX12-CU-NEXT:    s_wait_storecnt 0x0
15666; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
15667; GFX12-CU-NEXT:    s_endpgm
15668    ptr %out, i32 %in, i32 %old) {
15669entry:
15670  %gep = getelementptr i32, ptr %out, i32 4
15671  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
15672  ret void
15673}
15674
; Kernel under test: a volatile cmpxchg through a generic (flat) pointer with
; syncscope("one-as"), success ordering monotonic and failure ordering acquire.
; The access targets the i32 located 4 elements (16 bytes) past %out; the
; cmpxchg result is unused and the kernel returns void.
;
; What the checked output below shows for every run configuration:
;   - the flat_atomic_cmpswap directly follows the operand/address moves, i.e.
;     no vmcnt/vscnt/storecnt wait and no cache write-back is expected in
;     front of it;
;   - a counter wait is expected right after the atomic;
;   - every configuration except the SKIP-CACHE-INV run then expects one or
;     more cache-invalidate instructions before s_endpgm.
; NOTE(review): presumably the post-atomic wait + invalidate is driven by the
; acquire failure ordering -- confirm against the AMDGPU memory model docs.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
15675define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
15676; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15677; GFX7:       ; %bb.0: ; %entry
15678; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15679; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15680; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15681; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15682; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15683; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15684; GFX7-NEXT:    s_mov_b32 s4, s8
15685; GFX7-NEXT:    s_mov_b32 s5, s9
15686; GFX7-NEXT:    s_mov_b32 s9, s10
15687; GFX7-NEXT:    s_mov_b32 s8, s11
15688; GFX7-NEXT:    s_add_u32 s4, s4, s9
15689; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15690; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15691; GFX7-NEXT:    s_mov_b32 s5, s8
15692; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15693; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15694; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15695; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15696; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15697; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15698; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15699; GFX7-NEXT:    s_waitcnt vmcnt(0)
15700; GFX7-NEXT:    buffer_wbinvl1_vol
15701; GFX7-NEXT:    s_endpgm
15702;
15703; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15704; GFX10-WGP:       ; %bb.0: ; %entry
15705; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15706; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15707; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15708; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15709; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15710; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15711; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15712; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15713; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15714; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15715; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15716; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15717; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15718; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15719; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15720; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15721; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15722; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15723; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15724; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15725; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15726; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15727; GFX10-WGP-NEXT:    buffer_gl1_inv
15728; GFX10-WGP-NEXT:    buffer_gl0_inv
15729; GFX10-WGP-NEXT:    s_endpgm
15730;
15731; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15732; GFX10-CU:       ; %bb.0: ; %entry
15733; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15734; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15735; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15736; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15737; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15738; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15739; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15740; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15741; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15742; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15743; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15744; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15745; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15746; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15747; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15748; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15749; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15750; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15751; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15752; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15753; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15754; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15755; GFX10-CU-NEXT:    buffer_gl1_inv
15756; GFX10-CU-NEXT:    buffer_gl0_inv
15757; GFX10-CU-NEXT:    s_endpgm
15758;
15759; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15760; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15761; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15762; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15763; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15764; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15765; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15766; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15767; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15768; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15769; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15770; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15771; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15772; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15773; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15774; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15775; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15776; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15777; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15778; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15779; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15780; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15781; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15782; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15783; SKIP-CACHE-INV-NEXT:    s_endpgm
15784;
15785; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15786; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15787; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15788; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15789; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15790; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15791; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15792; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15793; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15794; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15795; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15796; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15797; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15798; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
15799; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
15800; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15801;
15802; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15803; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15804; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15805; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15806; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15807; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15808; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15809; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15810; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15811; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15812; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15813; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15814; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15815; GFX90A-TGSPLIT-NEXT:    buffer_invl2
15816; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
15817; GFX90A-TGSPLIT-NEXT:    s_endpgm
15818;
15819; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15820; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15821; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15822; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15823; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15824; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15825; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15826; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15827; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15828; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15829; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15830; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15831; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15832; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
15833; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15834;
15835; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15836; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15837; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15838; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15839; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15840; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15841; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15842; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15843; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15844; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15845; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15846; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
15847; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15848; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
15849; GFX940-TGSPLIT-NEXT:    s_endpgm
15850;
15851; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15852; GFX11-WGP:       ; %bb.0: ; %entry
15853; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15854; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15855; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15856; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15857; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15858; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15859; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15860; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15861; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15862; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15863; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15864; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15865; GFX11-WGP-NEXT:    buffer_gl1_inv
15866; GFX11-WGP-NEXT:    buffer_gl0_inv
15867; GFX11-WGP-NEXT:    s_endpgm
15868;
15869; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15870; GFX11-CU:       ; %bb.0: ; %entry
15871; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15872; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15873; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15874; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15875; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15876; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15877; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15878; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15879; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15880; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15881; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15882; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
15883; GFX11-CU-NEXT:    buffer_gl1_inv
15884; GFX11-CU-NEXT:    buffer_gl0_inv
15885; GFX11-CU-NEXT:    s_endpgm
15886;
15887; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15888; GFX12-WGP:       ; %bb.0: ; %entry
15889; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15890; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15891; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15892; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15893; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15894; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15895; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15896; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15897; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15898; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15899; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15900; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
15901; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
15902; GFX12-WGP-NEXT:    s_endpgm
15903;
15904; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
15905; GFX12-CU:       ; %bb.0: ; %entry
15906; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15907; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15908; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15909; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15910; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15911; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15912; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15913; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15914; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15915; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15916; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
15917; GFX12-CU-NEXT:    s_wait_storecnt 0x0
15918; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
15919; GFX12-CU-NEXT:    s_endpgm
15920    ptr %out, i32 %in, i32 %old) {
15921entry:
; Address of the 5th i32 (index 4) relative to %out.
15922  %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old, store %in on match; monotonic on success, acquire on failure.
15923  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
15924  ret void
15925}
15926
15927define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
15928; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
15929; GFX7:       ; %bb.0: ; %entry
15930; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15931; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15932; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15933; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15934; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15935; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15936; GFX7-NEXT:    s_mov_b32 s4, s8
15937; GFX7-NEXT:    s_mov_b32 s5, s9
15938; GFX7-NEXT:    s_mov_b32 s9, s10
15939; GFX7-NEXT:    s_mov_b32 s8, s11
15940; GFX7-NEXT:    s_add_u32 s4, s4, s9
15941; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15942; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15943; GFX7-NEXT:    s_mov_b32 s5, s8
15944; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15945; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15946; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15947; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15948; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15949; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15950; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15951; GFX7-NEXT:    s_waitcnt vmcnt(0)
15952; GFX7-NEXT:    buffer_wbinvl1_vol
15953; GFX7-NEXT:    s_endpgm
15954;
15955; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
15956; GFX10-WGP:       ; %bb.0: ; %entry
15957; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15958; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15959; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15960; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15961; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15962; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15963; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15964; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15965; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15966; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15967; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15968; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15969; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15970; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15971; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15972; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15973; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15974; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15975; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15976; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15977; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15978; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
15979; GFX10-WGP-NEXT:    buffer_gl1_inv
15980; GFX10-WGP-NEXT:    buffer_gl0_inv
15981; GFX10-WGP-NEXT:    s_endpgm
15982;
15983; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
15984; GFX10-CU:       ; %bb.0: ; %entry
15985; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15986; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15987; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15988; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15989; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15990; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15991; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15992; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15993; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15994; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15995; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15996; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15997; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15998; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15999; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
16000; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16001; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16002; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16003; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16004; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16005; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16006; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16007; GFX10-CU-NEXT:    buffer_gl1_inv
16008; GFX10-CU-NEXT:    buffer_gl0_inv
16009; GFX10-CU-NEXT:    s_endpgm
16010;
16011; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16012; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16013; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
16014; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
16015; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
16016; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
16017; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
16018; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16019; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
16020; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
16021; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
16022; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
16023; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
16024; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
16025; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
16026; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
16027; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
16028; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16029; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16030; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16031; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16032; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16033; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16034; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16035; SKIP-CACHE-INV-NEXT:    s_endpgm
16036;
16037; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16038; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16039; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16040; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16041; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16042; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16043; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16044; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16045; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16046; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16047; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16048; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16049; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16050; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
16051; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
16052; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16053;
16054; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16055; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16056; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16057; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16058; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16059; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16060; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16061; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16062; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16063; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16064; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16065; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16066; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16067; GFX90A-TGSPLIT-NEXT:    buffer_invl2
16068; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
16069; GFX90A-TGSPLIT-NEXT:    s_endpgm
16070;
16071; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16072; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16073; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16074; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16075; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16076; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16077; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16078; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16079; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16080; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16081; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16082; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16083; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16084; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
16085; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16086;
16087; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16088; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16089; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16090; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16091; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16092; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16093; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16094; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16095; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16096; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16097; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16098; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16099; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16100; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
16101; GFX940-TGSPLIT-NEXT:    s_endpgm
16102;
16103; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16104; GFX11-WGP:       ; %bb.0: ; %entry
16105; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16106; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16107; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16108; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16109; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16110; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16111; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16112; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16113; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16114; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16115; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16116; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16117; GFX11-WGP-NEXT:    buffer_gl1_inv
16118; GFX11-WGP-NEXT:    buffer_gl0_inv
16119; GFX11-WGP-NEXT:    s_endpgm
16120;
16121; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16122; GFX11-CU:       ; %bb.0: ; %entry
16123; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16124; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16125; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16126; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16127; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16128; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16129; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16130; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16131; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16132; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16133; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16134; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16135; GFX11-CU-NEXT:    buffer_gl1_inv
16136; GFX11-CU-NEXT:    buffer_gl0_inv
16137; GFX11-CU-NEXT:    s_endpgm
16138;
16139; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16140; GFX12-WGP:       ; %bb.0: ; %entry
16141; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16142; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16143; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16144; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16145; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16146; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16147; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16148; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16149; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16150; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16151; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16152; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16153; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
16154; GFX12-WGP-NEXT:    s_endpgm
16155;
16156; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
16157; GFX12-CU:       ; %bb.0: ; %entry
16158; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16159; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16160; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16161; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16162; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16163; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16164; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16165; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16166; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16167; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16168; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16169; GFX12-CU-NEXT:    s_wait_storecnt 0x0
16170; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
16171; GFX12-CU-NEXT:    s_endpgm
16172    ptr %out, i32 %in, i32 %old) {
16173entry:
16174  %gep = getelementptr i32, ptr %out, i32 4
16175  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
16176  ret void
16177}
16178
; Codegen test: volatile i32 cmpxchg through a generic `ptr` with
; syncscope("one-as"), success ordering release and failure ordering acquire.
; The target address is %out advanced by four i32 elements (16 bytes); the
; expected code either carries this as an `offset:16` operand on the atomic or
; adds 16 to the pointer with an s_add_u32/s_addc_u32 pair, depending on the
; check prefix.
; %val is never used, and every expected flat_atomic_cmpswap below is written
; without a destination register.
; Around the atomic, each prefix expects a wait (preceded by buffer_wbl2 or
; global_wb on some targets) before it, and a wait followed by cache
; invalidation after it -- presumably the release and acquire halves
; respectively (TODO confirm against the AMDGPU memory model documentation).
; The prefix whose RUN line passes -amdgcn-skip-cache-invalidations expects the
; trailing wait but no invalidate instruction.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
16179define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
16180; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16181; GFX7:       ; %bb.0: ; %entry
16182; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
16183; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16184; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
16185; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
16186; GFX7-NEXT:    s_mov_b64 s[10:11], 16
16187; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16188; GFX7-NEXT:    s_mov_b32 s4, s8
16189; GFX7-NEXT:    s_mov_b32 s5, s9
16190; GFX7-NEXT:    s_mov_b32 s9, s10
16191; GFX7-NEXT:    s_mov_b32 s8, s11
16192; GFX7-NEXT:    s_add_u32 s4, s4, s9
16193; GFX7-NEXT:    s_addc_u32 s8, s5, s8
16194; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16195; GFX7-NEXT:    s_mov_b32 s5, s8
16196; GFX7-NEXT:    v_mov_b32_e32 v2, s7
16197; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16198; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16199; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16200; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16201; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16202; GFX7-NEXT:    s_waitcnt vmcnt(0)
16203; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16204; GFX7-NEXT:    s_waitcnt vmcnt(0)
16205; GFX7-NEXT:    buffer_wbinvl1_vol
16206; GFX7-NEXT:    s_endpgm
16207;
16208; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16209; GFX10-WGP:       ; %bb.0: ; %entry
16210; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
16211; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16212; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
16213; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
16214; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
16215; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16216; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
16217; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
16218; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
16219; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
16220; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
16221; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
16222; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16223; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
16224; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
16225; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16226; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16227; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16228; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16229; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16230; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16231; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16232; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16233; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16234; GFX10-WGP-NEXT:    buffer_gl1_inv
16235; GFX10-WGP-NEXT:    buffer_gl0_inv
16236; GFX10-WGP-NEXT:    s_endpgm
16237;
16238; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16239; GFX10-CU:       ; %bb.0: ; %entry
16240; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
16241; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16242; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
16243; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
16244; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
16245; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16246; GFX10-CU-NEXT:    s_mov_b32 s4, s8
16247; GFX10-CU-NEXT:    s_mov_b32 s5, s9
16248; GFX10-CU-NEXT:    s_mov_b32 s9, s10
16249; GFX10-CU-NEXT:    s_mov_b32 s8, s11
16250; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
16251; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
16252; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16253; GFX10-CU-NEXT:    s_mov_b32 s5, s8
16254; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
16255; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16256; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16257; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16258; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16259; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16260; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16261; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16262; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16263; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16264; GFX10-CU-NEXT:    buffer_gl1_inv
16265; GFX10-CU-NEXT:    buffer_gl0_inv
16266; GFX10-CU-NEXT:    s_endpgm
16267;
16268; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16269; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16270; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
16271; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
16272; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
16273; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
16274; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
16275; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16276; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
16277; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
16278; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
16279; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
16280; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
16281; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
16282; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
16283; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
16284; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
16285; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16286; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16287; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16288; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16289; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16290; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16291; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16292; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16293; SKIP-CACHE-INV-NEXT:    s_endpgm
16294;
16295; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16296; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16297; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16298; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16299; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16300; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16301; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16302; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16303; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16304; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16305; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16306; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
16307; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16308; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16309; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16310; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
16311; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
16312; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16313;
16314; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16315; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16316; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16317; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16318; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16319; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16320; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16321; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16322; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16323; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16324; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16325; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
16326; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16327; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16328; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16329; GFX90A-TGSPLIT-NEXT:    buffer_invl2
16330; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
16331; GFX90A-TGSPLIT-NEXT:    s_endpgm
16332;
16333; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16334; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16335; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16336; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16337; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16338; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16339; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16340; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16341; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16342; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16343; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16344; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
16345; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16346; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16347; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16348; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
16349; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16350;
16351; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16352; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16353; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16354; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16355; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16356; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16357; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16358; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16359; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16360; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16361; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16362; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
16363; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16364; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16365; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16366; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
16367; GFX940-TGSPLIT-NEXT:    s_endpgm
16368;
16369; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16370; GFX11-WGP:       ; %bb.0: ; %entry
16371; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16372; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16373; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16374; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16375; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16376; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16377; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16378; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16379; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16380; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16381; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16382; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16383; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16384; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16385; GFX11-WGP-NEXT:    buffer_gl1_inv
16386; GFX11-WGP-NEXT:    buffer_gl0_inv
16387; GFX11-WGP-NEXT:    s_endpgm
16388;
16389; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16390; GFX11-CU:       ; %bb.0: ; %entry
16391; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16392; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16393; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16394; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16395; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16396; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16397; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16398; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16399; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16400; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16401; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16402; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16403; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16404; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16405; GFX11-CU-NEXT:    buffer_gl1_inv
16406; GFX11-CU-NEXT:    buffer_gl0_inv
16407; GFX11-CU-NEXT:    s_endpgm
16408;
16409; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16410; GFX12-WGP:       ; %bb.0: ; %entry
16411; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16412; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16413; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16414; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16415; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16416; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16417; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16418; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16419; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16420; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16421; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
16422; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
16423; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
16424; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16425; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16426; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16427; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16428; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
16429; GFX12-WGP-NEXT:    s_endpgm
16430;
16431; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
16432; GFX12-CU:       ; %bb.0: ; %entry
16433; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16434; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16435; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16436; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16437; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16438; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16439; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16440; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16441; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16442; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16443; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
16444; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
16445; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
16446; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
16447; GFX12-CU-NEXT:    s_wait_storecnt 0x0
16448; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16449; GFX12-CU-NEXT:    s_wait_storecnt 0x0
16450; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
16451; GFX12-CU-NEXT:    s_endpgm
16452    ptr %out, i32 %in, i32 %old) {
16453entry:
  ; Index 4 on an i32 element type: %gep is %out plus 16 bytes.
16454  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on a match; the result pair is discarded.
16455  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire
16456  ret void
16457}
16458
; Codegen test: volatile i32 cmpxchg through a generic `ptr` with
; syncscope("one-as"), success ordering acq_rel and failure ordering acquire.
; The target address is %out advanced by four i32 elements (16 bytes); the
; expected code either carries this as an `offset:16` operand on the atomic or
; adds 16 to the pointer with an s_add_u32/s_addc_u32 pair, depending on the
; check prefix.
; %val is never used, and every expected flat_atomic_cmpswap below is written
; without a destination register.
; Around the atomic, each prefix expects a wait (preceded by buffer_wbl2 or
; global_wb on some targets) before it, and a wait followed by cache
; invalidation after it -- presumably the release and acquire halves
; respectively (TODO confirm against the AMDGPU memory model documentation).
; The prefix whose RUN line passes -amdgcn-skip-cache-invalidations expects the
; trailing wait but no invalidate instruction.
; For every prefix the expected instructions match those of
; @flat_system_one_as_release_acquire_cmpxchg in this file.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
16459define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
16460; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16461; GFX7:       ; %bb.0: ; %entry
16462; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
16463; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16464; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
16465; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
16466; GFX7-NEXT:    s_mov_b64 s[10:11], 16
16467; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16468; GFX7-NEXT:    s_mov_b32 s4, s8
16469; GFX7-NEXT:    s_mov_b32 s5, s9
16470; GFX7-NEXT:    s_mov_b32 s9, s10
16471; GFX7-NEXT:    s_mov_b32 s8, s11
16472; GFX7-NEXT:    s_add_u32 s4, s4, s9
16473; GFX7-NEXT:    s_addc_u32 s8, s5, s8
16474; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16475; GFX7-NEXT:    s_mov_b32 s5, s8
16476; GFX7-NEXT:    v_mov_b32_e32 v2, s7
16477; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16478; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16479; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16480; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16481; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16482; GFX7-NEXT:    s_waitcnt vmcnt(0)
16483; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16484; GFX7-NEXT:    s_waitcnt vmcnt(0)
16485; GFX7-NEXT:    buffer_wbinvl1_vol
16486; GFX7-NEXT:    s_endpgm
16487;
16488; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16489; GFX10-WGP:       ; %bb.0: ; %entry
16490; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
16491; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16492; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
16493; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
16494; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
16495; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16496; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
16497; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
16498; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
16499; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
16500; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
16501; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
16502; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16503; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
16504; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
16505; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16506; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16507; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16508; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16509; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16510; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16511; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16512; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16513; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16514; GFX10-WGP-NEXT:    buffer_gl1_inv
16515; GFX10-WGP-NEXT:    buffer_gl0_inv
16516; GFX10-WGP-NEXT:    s_endpgm
16517;
16518; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16519; GFX10-CU:       ; %bb.0: ; %entry
16520; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
16521; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16522; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
16523; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
16524; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
16525; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16526; GFX10-CU-NEXT:    s_mov_b32 s4, s8
16527; GFX10-CU-NEXT:    s_mov_b32 s5, s9
16528; GFX10-CU-NEXT:    s_mov_b32 s9, s10
16529; GFX10-CU-NEXT:    s_mov_b32 s8, s11
16530; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
16531; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
16532; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16533; GFX10-CU-NEXT:    s_mov_b32 s5, s8
16534; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
16535; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16536; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16537; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16538; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16539; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16540; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16541; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16542; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16543; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16544; GFX10-CU-NEXT:    buffer_gl1_inv
16545; GFX10-CU-NEXT:    buffer_gl0_inv
16546; GFX10-CU-NEXT:    s_endpgm
16547;
16548; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16549; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16550; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
16551; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
16552; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
16553; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
16554; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
16555; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16556; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
16557; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
16558; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
16559; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
16560; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
16561; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
16562; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
16563; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
16564; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
16565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16566; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16568; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16569; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16570; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16571; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16572; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16573; SKIP-CACHE-INV-NEXT:    s_endpgm
16574;
16575; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16576; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16577; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16578; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16579; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16580; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16581; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16582; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16583; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16584; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16585; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16586; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
16587; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16588; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16589; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16590; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
16591; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
16592; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16593;
16594; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16595; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16596; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16597; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16598; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16599; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16600; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16601; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16602; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16603; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16604; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16605; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
16606; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16607; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16608; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16609; GFX90A-TGSPLIT-NEXT:    buffer_invl2
16610; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
16611; GFX90A-TGSPLIT-NEXT:    s_endpgm
16612;
16613; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16614; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16615; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16616; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16617; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16618; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16619; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16620; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16621; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16622; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16623; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16624; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
16625; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16626; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16627; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16628; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
16629; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16630;
16631; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16632; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16633; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16634; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16635; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16636; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16637; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16638; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16639; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16640; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16641; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16642; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
16643; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16644; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16645; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16646; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
16647; GFX940-TGSPLIT-NEXT:    s_endpgm
16648;
16649; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16650; GFX11-WGP:       ; %bb.0: ; %entry
16651; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16652; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16653; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16654; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16655; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16656; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16657; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16658; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16659; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16660; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16661; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16662; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16663; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16664; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16665; GFX11-WGP-NEXT:    buffer_gl1_inv
16666; GFX11-WGP-NEXT:    buffer_gl0_inv
16667; GFX11-WGP-NEXT:    s_endpgm
16668;
16669; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16670; GFX11-CU:       ; %bb.0: ; %entry
16671; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16672; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16673; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16674; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16675; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16676; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16677; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16678; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16679; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16680; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16681; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16682; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16683; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16684; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16685; GFX11-CU-NEXT:    buffer_gl1_inv
16686; GFX11-CU-NEXT:    buffer_gl0_inv
16687; GFX11-CU-NEXT:    s_endpgm
16688;
16689; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16690; GFX12-WGP:       ; %bb.0: ; %entry
16691; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16692; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16693; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16694; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16695; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16696; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16697; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16698; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16699; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16700; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16701; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
16702; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
16703; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
16704; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16705; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16706; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16707; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16708; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
16709; GFX12-WGP-NEXT:    s_endpgm
16710;
16711; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
16712; GFX12-CU:       ; %bb.0: ; %entry
16713; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16714; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16715; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16716; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16717; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16718; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16719; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16720; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16721; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16722; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16723; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
16724; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
16725; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
16726; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
16727; GFX12-CU-NEXT:    s_wait_storecnt 0x0
16728; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16729; GFX12-CU-NEXT:    s_wait_storecnt 0x0
16730; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
16731; GFX12-CU-NEXT:    s_endpgm
16732    ptr %out, i32 %in, i32 %old) {
16733entry:
  ; Index 4 on an i32 element type: %gep is %out plus 16 bytes.
16734  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on a match; the result pair is discarded.
16735  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
16736  ret void
16737}
16738
; Codegen expectations for a volatile cmpxchg through a flat pointer with
; syncscope("one-as"), success ordering seq_cst and failure ordering acquire.
; The check lines inside the definition are produced by
; utils/update_llc_test_checks.py and pin the -O0 output of each llc
; configuration in the file header; regenerate them instead of hand-editing.
; They record the wait / write-back instructions expected ahead of the
; flat_atomic_cmpswap and the wait / cache-invalidate instructions expected
; after it. The configuration run with -amdgcn-skip-cache-invalidations expects
; no invalidate after the atomic. The 16-byte displacement of %gep is expected
; to be added with scalar adds on gfx700 and gfx1010, and to appear as an
; immediate "offset:16" on gfx90a, gfx940, gfx1100 and gfx1200.
16739define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
16740; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16741; GFX7:       ; %bb.0: ; %entry
16742; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
16743; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16744; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
16745; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
16746; GFX7-NEXT:    s_mov_b64 s[10:11], 16
16747; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16748; GFX7-NEXT:    s_mov_b32 s4, s8
16749; GFX7-NEXT:    s_mov_b32 s5, s9
16750; GFX7-NEXT:    s_mov_b32 s9, s10
16751; GFX7-NEXT:    s_mov_b32 s8, s11
16752; GFX7-NEXT:    s_add_u32 s4, s4, s9
16753; GFX7-NEXT:    s_addc_u32 s8, s5, s8
16754; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16755; GFX7-NEXT:    s_mov_b32 s5, s8
16756; GFX7-NEXT:    v_mov_b32_e32 v2, s7
16757; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16758; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16759; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16760; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16761; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16762; GFX7-NEXT:    s_waitcnt vmcnt(0)
16763; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16764; GFX7-NEXT:    s_waitcnt vmcnt(0)
16765; GFX7-NEXT:    buffer_wbinvl1_vol
16766; GFX7-NEXT:    s_endpgm
16767;
16768; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16769; GFX10-WGP:       ; %bb.0: ; %entry
16770; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
16771; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16772; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
16773; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
16774; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
16775; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16776; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
16777; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
16778; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
16779; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
16780; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
16781; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
16782; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16783; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
16784; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
16785; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16786; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16787; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16788; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16789; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16790; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16791; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16792; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16793; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16794; GFX10-WGP-NEXT:    buffer_gl1_inv
16795; GFX10-WGP-NEXT:    buffer_gl0_inv
16796; GFX10-WGP-NEXT:    s_endpgm
16797;
16798; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16799; GFX10-CU:       ; %bb.0: ; %entry
16800; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
16801; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
16802; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
16803; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
16804; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
16805; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16806; GFX10-CU-NEXT:    s_mov_b32 s4, s8
16807; GFX10-CU-NEXT:    s_mov_b32 s5, s9
16808; GFX10-CU-NEXT:    s_mov_b32 s9, s10
16809; GFX10-CU-NEXT:    s_mov_b32 s8, s11
16810; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
16811; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
16812; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
16813; GFX10-CU-NEXT:    s_mov_b32 s5, s8
16814; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
16815; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16816; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16817; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16818; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16819; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16820; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16821; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16822; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16823; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16824; GFX10-CU-NEXT:    buffer_gl1_inv
16825; GFX10-CU-NEXT:    buffer_gl0_inv
16826; GFX10-CU-NEXT:    s_endpgm
16827;
16828; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16829; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16830; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
16831; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
16832; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
16833; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
16834; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
16835; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16836; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
16837; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
16838; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
16839; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
16840; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
16841; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
16842; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
16843; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
16844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
16845; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16846; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16847; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16848; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16849; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16850; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16851; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
16852; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16853; SKIP-CACHE-INV-NEXT:    s_endpgm
16854;
16855; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16856; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16857; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16858; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16859; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16860; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16861; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16862; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16863; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16864; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16865; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16866; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
16867; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16868; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16869; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16870; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
16871; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
16872; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16873;
16874; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16875; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16876; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16877; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16878; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16879; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16880; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16881; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16882; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16883; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16884; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16885; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
16886; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16887; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
16888; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16889; GFX90A-TGSPLIT-NEXT:    buffer_invl2
16890; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
16891; GFX90A-TGSPLIT-NEXT:    s_endpgm
16892;
16893; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16894; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16895; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16896; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16897; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16898; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16899; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16900; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16901; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16902; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16903; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16904; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
16905; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16906; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16907; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16908; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
16909; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16910;
16911; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16912; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16913; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16914; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16915; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16916; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16917; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16918; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16919; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16920; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16921; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16922; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
16923; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16924; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
16925; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16926; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
16927; GFX940-TGSPLIT-NEXT:    s_endpgm
16928;
16929; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16930; GFX11-WGP:       ; %bb.0: ; %entry
16931; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16932; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16933; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16934; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16935; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16936; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16937; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16938; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16939; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16940; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16941; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16942; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16943; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16944; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
16945; GFX11-WGP-NEXT:    buffer_gl1_inv
16946; GFX11-WGP-NEXT:    buffer_gl0_inv
16947; GFX11-WGP-NEXT:    s_endpgm
16948;
16949; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16950; GFX11-CU:       ; %bb.0: ; %entry
16951; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16952; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16953; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16954; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16955; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16956; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16957; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16958; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16959; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16960; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16961; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16962; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16963; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
16964; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
16965; GFX11-CU-NEXT:    buffer_gl1_inv
16966; GFX11-CU-NEXT:    buffer_gl0_inv
16967; GFX11-CU-NEXT:    s_endpgm
16968;
16969; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16970; GFX12-WGP:       ; %bb.0: ; %entry
16971; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16972; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16973; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16974; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16975; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16976; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16977; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16978; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16979; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16980; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16981; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
16982; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
16983; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
16984; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16985; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16986; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
16987; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
16988; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
16989; GFX12-WGP-NEXT:    s_endpgm
16990;
16991; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
16992; GFX12-CU:       ; %bb.0: ; %entry
16993; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16994; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16995; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16996; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16997; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16998; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16999; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17000; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17001; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17002; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17003; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
17004; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
17005; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
17006; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17007; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17008; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17009; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17010; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
17011; GFX12-CU-NEXT:    s_endpgm
17012    ptr %out, i32 %in, i32 %old) {
17013entry:
  ; %gep points four i32 elements (16 bytes) past %out.
17014  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the value compared against memory and %in is the replacement;
  ; the result %val is intentionally left unused.
17015  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
17016  ret void
17017}
17018
; Codegen expectations for a volatile cmpxchg through a flat pointer with
; syncscope("one-as"), success ordering monotonic and failure ordering seq_cst.
; The check lines inside the definition are produced by
; utils/update_llc_test_checks.py and pin the -O0 output of each llc
; configuration in the file header; regenerate them instead of hand-editing.
; Although the success ordering is only monotonic, every configuration still
; expects waits (plus a write-back on gfx90a, gfx940 and gfx1200) ahead of the
; flat_atomic_cmpswap and waits plus cache invalidates after it. The
; configuration run with -amdgcn-skip-cache-invalidations expects no invalidate
; after the atomic. The 16-byte displacement of %gep is expected to be added
; with scalar adds on gfx700 and gfx1010, and to appear as an immediate
; "offset:16" on gfx90a, gfx940, gfx1100 and gfx1200.
17019define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
17020; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17021; GFX7:       ; %bb.0: ; %entry
17022; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
17023; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17024; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
17025; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
17026; GFX7-NEXT:    s_mov_b64 s[10:11], 16
17027; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17028; GFX7-NEXT:    s_mov_b32 s4, s8
17029; GFX7-NEXT:    s_mov_b32 s5, s9
17030; GFX7-NEXT:    s_mov_b32 s9, s10
17031; GFX7-NEXT:    s_mov_b32 s8, s11
17032; GFX7-NEXT:    s_add_u32 s4, s4, s9
17033; GFX7-NEXT:    s_addc_u32 s8, s5, s8
17034; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17035; GFX7-NEXT:    s_mov_b32 s5, s8
17036; GFX7-NEXT:    v_mov_b32_e32 v2, s7
17037; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17038; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17039; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17040; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17041; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17042; GFX7-NEXT:    s_waitcnt vmcnt(0)
17043; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17044; GFX7-NEXT:    s_waitcnt vmcnt(0)
17045; GFX7-NEXT:    buffer_wbinvl1_vol
17046; GFX7-NEXT:    s_endpgm
17047;
17048; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17049; GFX10-WGP:       ; %bb.0: ; %entry
17050; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
17051; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17052; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
17053; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
17054; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
17055; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17056; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
17057; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
17058; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
17059; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
17060; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
17061; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
17062; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17063; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
17064; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
17065; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17066; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17067; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17068; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17069; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17070; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17071; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17072; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17073; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17074; GFX10-WGP-NEXT:    buffer_gl1_inv
17075; GFX10-WGP-NEXT:    buffer_gl0_inv
17076; GFX10-WGP-NEXT:    s_endpgm
17077;
17078; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17079; GFX10-CU:       ; %bb.0: ; %entry
17080; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
17081; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17082; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
17083; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
17084; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
17085; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17086; GFX10-CU-NEXT:    s_mov_b32 s4, s8
17087; GFX10-CU-NEXT:    s_mov_b32 s5, s9
17088; GFX10-CU-NEXT:    s_mov_b32 s9, s10
17089; GFX10-CU-NEXT:    s_mov_b32 s8, s11
17090; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
17091; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
17092; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17093; GFX10-CU-NEXT:    s_mov_b32 s5, s8
17094; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
17095; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17096; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17097; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17098; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17099; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17100; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17101; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17102; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17103; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17104; GFX10-CU-NEXT:    buffer_gl1_inv
17105; GFX10-CU-NEXT:    buffer_gl0_inv
17106; GFX10-CU-NEXT:    s_endpgm
17107;
17108; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17109; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17110; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
17111; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
17112; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
17113; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
17114; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
17115; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17116; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
17117; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
17118; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
17119; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
17120; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
17121; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
17122; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
17123; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
17124; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
17125; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17126; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17127; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17128; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17129; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17130; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17131; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17132; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17133; SKIP-CACHE-INV-NEXT:    s_endpgm
17134;
17135; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17136; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17137; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17138; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17139; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17140; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17141; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17142; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17143; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17144; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17145; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17146; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
17147; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17148; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17149; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17150; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
17151; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
17152; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17153;
17154; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17155; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17156; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17157; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17158; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17159; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17160; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17161; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17162; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17163; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17164; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17165; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
17166; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17167; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17168; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17169; GFX90A-TGSPLIT-NEXT:    buffer_invl2
17170; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
17171; GFX90A-TGSPLIT-NEXT:    s_endpgm
17172;
17173; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17174; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17175; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17176; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17177; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17178; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17179; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17180; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17181; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17182; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17183; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17184; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
17185; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17186; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
17187; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17188; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
17189; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17190;
17191; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17192; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17193; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17194; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17195; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17196; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17197; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17198; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17199; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17200; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17201; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17202; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
17203; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17204; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
17205; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17206; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
17207; GFX940-TGSPLIT-NEXT:    s_endpgm
17208;
17209; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17210; GFX11-WGP:       ; %bb.0: ; %entry
17211; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17212; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17213; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17214; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17215; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17216; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17217; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17218; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17219; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17220; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17221; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17222; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17223; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
17224; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17225; GFX11-WGP-NEXT:    buffer_gl1_inv
17226; GFX11-WGP-NEXT:    buffer_gl0_inv
17227; GFX11-WGP-NEXT:    s_endpgm
17228;
17229; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17230; GFX11-CU:       ; %bb.0: ; %entry
17231; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17232; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17233; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17234; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17235; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17236; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17237; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17238; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17239; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17240; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17241; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17242; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17243; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
17244; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17245; GFX11-CU-NEXT:    buffer_gl1_inv
17246; GFX11-CU-NEXT:    buffer_gl0_inv
17247; GFX11-CU-NEXT:    s_endpgm
17248;
17249; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17250; GFX12-WGP:       ; %bb.0: ; %entry
17251; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17252; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17253; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17254; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17255; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17256; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17257; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17258; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17259; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17260; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17261; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
17262; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17263; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17264; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17265; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17266; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17267; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17268; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
17269; GFX12-WGP-NEXT:    s_endpgm
17270;
17271; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
17272; GFX12-CU:       ; %bb.0: ; %entry
17273; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17274; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17275; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17276; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17277; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17278; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17279; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17280; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17281; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17282; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17283; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
17284; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
17285; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
17286; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17287; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17288; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17289; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17290; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
17291; GFX12-CU-NEXT:    s_endpgm
17292    ptr %out, i32 %in, i32 %old) {
17293entry:
  ; %gep points four i32 elements (16 bytes) past %out.
17294  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the value compared against memory and %in is the replacement;
  ; the result %val is intentionally left unused.
17295  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
17296  ret void
17297}
17298
17299define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
17300; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17301; GFX7:       ; %bb.0: ; %entry
17302; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
17303; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17304; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
17305; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
17306; GFX7-NEXT:    s_mov_b64 s[10:11], 16
17307; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17308; GFX7-NEXT:    s_mov_b32 s4, s8
17309; GFX7-NEXT:    s_mov_b32 s5, s9
17310; GFX7-NEXT:    s_mov_b32 s9, s10
17311; GFX7-NEXT:    s_mov_b32 s8, s11
17312; GFX7-NEXT:    s_add_u32 s4, s4, s9
17313; GFX7-NEXT:    s_addc_u32 s8, s5, s8
17314; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17315; GFX7-NEXT:    s_mov_b32 s5, s8
17316; GFX7-NEXT:    v_mov_b32_e32 v2, s7
17317; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17318; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17319; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17320; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17321; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17322; GFX7-NEXT:    s_waitcnt vmcnt(0)
17323; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17324; GFX7-NEXT:    s_waitcnt vmcnt(0)
17325; GFX7-NEXT:    buffer_wbinvl1_vol
17326; GFX7-NEXT:    s_endpgm
17327;
17328; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17329; GFX10-WGP:       ; %bb.0: ; %entry
17330; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
17331; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17332; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
17333; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
17334; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
17335; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17336; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
17337; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
17338; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
17339; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
17340; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
17341; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
17342; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17343; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
17344; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
17345; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17346; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17347; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17348; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17349; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17350; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17351; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17352; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17353; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17354; GFX10-WGP-NEXT:    buffer_gl1_inv
17355; GFX10-WGP-NEXT:    buffer_gl0_inv
17356; GFX10-WGP-NEXT:    s_endpgm
17357;
17358; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17359; GFX10-CU:       ; %bb.0: ; %entry
17360; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
17361; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17362; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
17363; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
17364; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
17365; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17366; GFX10-CU-NEXT:    s_mov_b32 s4, s8
17367; GFX10-CU-NEXT:    s_mov_b32 s5, s9
17368; GFX10-CU-NEXT:    s_mov_b32 s9, s10
17369; GFX10-CU-NEXT:    s_mov_b32 s8, s11
17370; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
17371; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
17372; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17373; GFX10-CU-NEXT:    s_mov_b32 s5, s8
17374; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
17375; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17376; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17377; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17378; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17379; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17380; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17381; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17382; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17383; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17384; GFX10-CU-NEXT:    buffer_gl1_inv
17385; GFX10-CU-NEXT:    buffer_gl0_inv
17386; GFX10-CU-NEXT:    s_endpgm
17387;
17388; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17389; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17390; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
17391; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
17392; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
17393; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
17394; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
17395; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17396; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
17397; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
17398; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
17399; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
17400; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
17401; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
17402; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
17403; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
17404; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
17405; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17406; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17408; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17409; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17410; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17411; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17412; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17413; SKIP-CACHE-INV-NEXT:    s_endpgm
17414;
17415; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17416; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17417; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17418; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17419; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17420; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17421; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17422; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17423; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17424; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17425; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17426; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
17427; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17428; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17429; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17430; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
17431; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
17432; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17433;
17434; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17435; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17436; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17437; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17438; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17439; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17440; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17441; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17442; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17443; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17444; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17445; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
17446; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17447; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17448; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17449; GFX90A-TGSPLIT-NEXT:    buffer_invl2
17450; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
17451; GFX90A-TGSPLIT-NEXT:    s_endpgm
17452;
17453; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17454; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17455; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17456; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17457; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17458; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17459; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17460; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17461; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17462; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17463; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17464; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
17465; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17466; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
17467; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17468; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
17469; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17470;
17471; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17472; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17473; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17474; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17475; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17476; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17477; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17478; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17479; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17480; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17481; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17482; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
17483; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17484; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
17485; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17486; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
17487; GFX940-TGSPLIT-NEXT:    s_endpgm
17488;
17489; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17490; GFX11-WGP:       ; %bb.0: ; %entry
17491; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17492; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17493; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17494; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17495; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17496; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17497; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17498; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17499; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17500; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17501; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17502; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17503; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
17504; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17505; GFX11-WGP-NEXT:    buffer_gl1_inv
17506; GFX11-WGP-NEXT:    buffer_gl0_inv
17507; GFX11-WGP-NEXT:    s_endpgm
17508;
17509; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17510; GFX11-CU:       ; %bb.0: ; %entry
17511; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17512; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17513; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17514; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17515; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17516; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17517; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17518; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17519; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17520; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17521; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17522; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17523; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
17524; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17525; GFX11-CU-NEXT:    buffer_gl1_inv
17526; GFX11-CU-NEXT:    buffer_gl0_inv
17527; GFX11-CU-NEXT:    s_endpgm
17528;
17529; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17530; GFX12-WGP:       ; %bb.0: ; %entry
17531; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17532; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17533; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17534; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17535; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17536; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17537; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17538; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17539; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17540; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17541; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
17542; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17543; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17544; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17545; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17546; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17547; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17548; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
17549; GFX12-WGP-NEXT:    s_endpgm
17550;
17551; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
17552; GFX12-CU:       ; %bb.0: ; %entry
17553; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17554; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17555; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17556; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17557; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17558; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17559; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17560; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17561; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17562; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17563; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
17564; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
17565; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
17566; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17567; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17568; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17569; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17570; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
17571; GFX12-CU-NEXT:    s_endpgm
17572    ptr %out, i32 %in, i32 %old) {
17573entry:
17574  %gep = getelementptr i32, ptr %out, i32 4
17575  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
17576  ret void
17577}
17578
; Test kernel: 32-bit cmpxchg through a plain `ptr` at syncscope("one-as"),
; with success ordering `release` and failure ordering `seq_cst`. Every
; expectation below selects a flat_atomic_cmpswap instruction for it.
;
; Kernel arguments (declared after the check lines): %out is the base
; pointer, %in the replacement value and %old the value compared against.
; The access targets element 4 of %out, i.e. byte offset 16, so the
; expectations either fold `offset:16` into the atomic or add the constant 16
; to the address with an s_add_u32 / s_addc_u32 pair.
;
; The check lines between the `define` line and the parameter list are
; produced by utils/update_llc_test_checks.py (see the note on the first line
; of this file); regenerate them with that script instead of editing by hand.
;
; NOTE(review): the expected sequences have the same shape as those of the
; acquire/seq_cst variant directly above (wait or write-back before the
; atomic, wait plus cache invalidate after it). Presumably the seq_cst failure
; ordering is what drives this - confirm against the memory legalizer.
17579define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
17580; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17581; GFX7:       ; %bb.0: ; %entry
17582; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
17583; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17584; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
17585; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
17586; GFX7-NEXT:    s_mov_b64 s[10:11], 16
17587; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17588; GFX7-NEXT:    s_mov_b32 s4, s8
17589; GFX7-NEXT:    s_mov_b32 s5, s9
17590; GFX7-NEXT:    s_mov_b32 s9, s10
17591; GFX7-NEXT:    s_mov_b32 s8, s11
17592; GFX7-NEXT:    s_add_u32 s4, s4, s9
17593; GFX7-NEXT:    s_addc_u32 s8, s5, s8
17594; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17595; GFX7-NEXT:    s_mov_b32 s5, s8
17596; GFX7-NEXT:    v_mov_b32_e32 v2, s7
17597; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17598; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17599; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17600; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17601; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17602; GFX7-NEXT:    s_waitcnt vmcnt(0)
17603; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17604; GFX7-NEXT:    s_waitcnt vmcnt(0)
17605; GFX7-NEXT:    buffer_wbinvl1_vol
17606; GFX7-NEXT:    s_endpgm
17607;
17608; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17609; GFX10-WGP:       ; %bb.0: ; %entry
17610; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
17611; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17612; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
17613; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
17614; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
17615; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17616; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
17617; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
17618; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
17619; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
17620; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
17621; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
17622; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17623; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
17624; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
17625; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17626; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17627; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17628; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17629; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17630; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17631; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17632; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17633; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17634; GFX10-WGP-NEXT:    buffer_gl1_inv
17635; GFX10-WGP-NEXT:    buffer_gl0_inv
17636; GFX10-WGP-NEXT:    s_endpgm
17637;
17638; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17639; GFX10-CU:       ; %bb.0: ; %entry
17640; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
17641; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17642; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
17643; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
17644; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
17645; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17646; GFX10-CU-NEXT:    s_mov_b32 s4, s8
17647; GFX10-CU-NEXT:    s_mov_b32 s5, s9
17648; GFX10-CU-NEXT:    s_mov_b32 s9, s10
17649; GFX10-CU-NEXT:    s_mov_b32 s8, s11
17650; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
17651; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
17652; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17653; GFX10-CU-NEXT:    s_mov_b32 s5, s8
17654; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
17655; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17656; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17657; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17658; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17659; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17660; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17661; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17662; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17663; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17664; GFX10-CU-NEXT:    buffer_gl1_inv
17665; GFX10-CU-NEXT:    buffer_gl0_inv
17666; GFX10-CU-NEXT:    s_endpgm
17667;
17668; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17669; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17670; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
17671; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
17672; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
17673; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
17674; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
17675; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17676; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
17677; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
17678; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
17679; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
17680; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
17681; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
17682; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
17683; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
17684; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
17685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17686; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17687; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17688; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17689; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17690; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17691; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17692; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17693; SKIP-CACHE-INV-NEXT:    s_endpgm
17694;
17695; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17696; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17697; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17698; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17699; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17700; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17701; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17702; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17703; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17704; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17705; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17706; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
17707; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17708; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17709; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17710; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
17711; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
17712; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17713;
17714; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17715; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17716; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17717; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17718; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17719; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17720; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17721; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17722; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17723; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17724; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17725; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
17726; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17727; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17728; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17729; GFX90A-TGSPLIT-NEXT:    buffer_invl2
17730; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
17731; GFX90A-TGSPLIT-NEXT:    s_endpgm
17732;
17733; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17734; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17735; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17736; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17737; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17738; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17739; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17740; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17741; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17742; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17743; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17744; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
17745; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17746; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
17747; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17748; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
17749; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17750;
17751; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17752; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17753; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17754; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17755; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17756; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17757; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17758; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17759; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17760; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17761; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17762; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
17763; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17764; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
17765; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17766; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
17767; GFX940-TGSPLIT-NEXT:    s_endpgm
17768;
17769; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17770; GFX11-WGP:       ; %bb.0: ; %entry
17771; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17772; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17773; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17774; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17775; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17776; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17777; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17778; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17779; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17780; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17781; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17782; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17783; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
17784; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17785; GFX11-WGP-NEXT:    buffer_gl1_inv
17786; GFX11-WGP-NEXT:    buffer_gl0_inv
17787; GFX11-WGP-NEXT:    s_endpgm
17788;
17789; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17790; GFX11-CU:       ; %bb.0: ; %entry
17791; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17792; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17793; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17794; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17795; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17796; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17797; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17798; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17799; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17800; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17801; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17802; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17803; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
17804; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17805; GFX11-CU-NEXT:    buffer_gl1_inv
17806; GFX11-CU-NEXT:    buffer_gl0_inv
17807; GFX11-CU-NEXT:    s_endpgm
17808;
17809; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17810; GFX12-WGP:       ; %bb.0: ; %entry
17811; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17812; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17813; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17814; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17815; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17816; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17817; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17818; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17819; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17820; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17821; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
17822; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
17823; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
17824; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17825; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17826; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17827; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
17828; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
17829; GFX12-WGP-NEXT:    s_endpgm
17830;
17831; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
17832; GFX12-CU:       ; %bb.0: ; %entry
17833; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17834; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17835; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17836; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17837; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17838; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17839; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17840; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17841; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17842; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17843; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
17844; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
17845; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
17846; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17847; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17848; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
17849; GFX12-CU-NEXT:    s_wait_storecnt 0x0
17850; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
17851; GFX12-CU-NEXT:    s_endpgm
17852    ptr %out, i32 %in, i32 %old) {
17853entry:
  ; Address of element 4 of %out: 4 * sizeof(i32) = 16 bytes past the base.
17854  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is pointer, expected value (%old), replacement (%in); the
  ; two orderings are success (release) followed by failure (seq_cst).
  ; The result %val is never used by the kernel.
17855  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
17856  ret void
17857}
17858
17859define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
17860; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
17861; GFX7:       ; %bb.0: ; %entry
17862; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
17863; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17864; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
17865; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
17866; GFX7-NEXT:    s_mov_b64 s[10:11], 16
17867; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17868; GFX7-NEXT:    s_mov_b32 s4, s8
17869; GFX7-NEXT:    s_mov_b32 s5, s9
17870; GFX7-NEXT:    s_mov_b32 s9, s10
17871; GFX7-NEXT:    s_mov_b32 s8, s11
17872; GFX7-NEXT:    s_add_u32 s4, s4, s9
17873; GFX7-NEXT:    s_addc_u32 s8, s5, s8
17874; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17875; GFX7-NEXT:    s_mov_b32 s5, s8
17876; GFX7-NEXT:    v_mov_b32_e32 v2, s7
17877; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17878; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17879; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17880; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17881; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17882; GFX7-NEXT:    s_waitcnt vmcnt(0)
17883; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17884; GFX7-NEXT:    s_waitcnt vmcnt(0)
17885; GFX7-NEXT:    buffer_wbinvl1_vol
17886; GFX7-NEXT:    s_endpgm
17887;
17888; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
17889; GFX10-WGP:       ; %bb.0: ; %entry
17890; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
17891; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17892; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
17893; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
17894; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
17895; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17896; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
17897; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
17898; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
17899; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
17900; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
17901; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
17902; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17903; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
17904; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
17905; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17906; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17907; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17908; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17909; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17910; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17911; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17912; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17913; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
17914; GFX10-WGP-NEXT:    buffer_gl1_inv
17915; GFX10-WGP-NEXT:    buffer_gl0_inv
17916; GFX10-WGP-NEXT:    s_endpgm
17917;
17918; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
17919; GFX10-CU:       ; %bb.0: ; %entry
17920; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
17921; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
17922; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
17923; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
17924; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
17925; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17926; GFX10-CU-NEXT:    s_mov_b32 s4, s8
17927; GFX10-CU-NEXT:    s_mov_b32 s5, s9
17928; GFX10-CU-NEXT:    s_mov_b32 s9, s10
17929; GFX10-CU-NEXT:    s_mov_b32 s8, s11
17930; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
17931; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
17932; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17933; GFX10-CU-NEXT:    s_mov_b32 s5, s8
17934; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
17935; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17936; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17937; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17938; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17939; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17940; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17941; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17942; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17943; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
17944; GFX10-CU-NEXT:    buffer_gl1_inv
17945; GFX10-CU-NEXT:    buffer_gl0_inv
17946; GFX10-CU-NEXT:    s_endpgm
17947;
17948; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
17949; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17950; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
17951; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
17952; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
17953; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
17954; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
17955; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17956; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
17957; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
17958; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
17959; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
17960; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
17961; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
17962; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
17963; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
17964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
17965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17966; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17967; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17968; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17969; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17970; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17971; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
17972; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17973; SKIP-CACHE-INV-NEXT:    s_endpgm
17974;
17975; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
17976; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17977; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17978; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17979; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17980; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17981; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17982; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17983; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17984; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17985; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17986; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
17987; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17988; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
17989; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17990; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
17991; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
17992; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17993;
17994; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
17995; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17996; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17997; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17998; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17999; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18000; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18001; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18002; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18003; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18004; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18005; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
18006; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18007; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
18008; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18009; GFX90A-TGSPLIT-NEXT:    buffer_invl2
18010; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18011; GFX90A-TGSPLIT-NEXT:    s_endpgm
18012;
18013; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
18014; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18015; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18016; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18017; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18018; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18019; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18020; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18021; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18022; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18023; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18024; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
18025; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18026; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
18027; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18028; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
18029; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18030;
18031; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
18032; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18033; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18034; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18035; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18036; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18037; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18038; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18039; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18040; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18041; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18042; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
18043; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18044; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
18045; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18046; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
18047; GFX940-TGSPLIT-NEXT:    s_endpgm
18048;
18049; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
18050; GFX11-WGP:       ; %bb.0: ; %entry
18051; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18052; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18053; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18054; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18055; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18056; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18057; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18058; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18059; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18060; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18061; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18062; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18063; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
18064; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18065; GFX11-WGP-NEXT:    buffer_gl1_inv
18066; GFX11-WGP-NEXT:    buffer_gl0_inv
18067; GFX11-WGP-NEXT:    s_endpgm
18068;
18069; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
18070; GFX11-CU:       ; %bb.0: ; %entry
18071; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18072; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18073; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18074; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18075; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18076; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18077; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18078; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18079; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18080; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18081; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18082; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
18083; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
18084; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
18085; GFX11-CU-NEXT:    buffer_gl1_inv
18086; GFX11-CU-NEXT:    buffer_gl0_inv
18087; GFX11-CU-NEXT:    s_endpgm
18088;
18089; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
18090; GFX12-WGP:       ; %bb.0: ; %entry
18091; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18092; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18093; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18094; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18095; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18096; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18097; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18098; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18099; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18100; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18101; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
18102; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18103; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18104; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18105; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18106; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
18107; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18108; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
18109; GFX12-WGP-NEXT:    s_endpgm
18110;
18111; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
18112; GFX12-CU:       ; %bb.0: ; %entry
18113; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18114; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18115; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18116; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18117; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18118; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18119; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18120; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18121; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18122; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18123; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
18124; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
18125; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
18126; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18127; GFX12-CU-NEXT:    s_wait_storecnt 0x0
18128; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
18129; GFX12-CU-NEXT:    s_wait_storecnt 0x0
18130; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
18131; GFX12-CU-NEXT:    s_endpgm
18132    ptr %out, i32 %in, i32 %old) {
18133entry:
18134  %gep = getelementptr i32, ptr %out, i32 4
18135  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
18136  ret void
18137}
18138
; Kernel performing a volatile i32 cmpxchg through a generic pointer with
; syncscope("one-as") and seq_cst ordering for both success and failure.
; The result is discarded, so every target below is expected to select the
; non-returning form of flat_atomic_cmpswap. The autogenerated assertions pin
; the waits, write-backs and cache invalidations placed around that atomic;
; the SKIP-CACHE-INV run is the one that ends with no invalidate after it.
18139define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
18140; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18141; GFX7:       ; %bb.0: ; %entry
18142; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
18143; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
18144; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
18145; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
18146; GFX7-NEXT:    s_mov_b64 s[10:11], 16
18147; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18148; GFX7-NEXT:    s_mov_b32 s4, s8
18149; GFX7-NEXT:    s_mov_b32 s5, s9
18150; GFX7-NEXT:    s_mov_b32 s9, s10
18151; GFX7-NEXT:    s_mov_b32 s8, s11
18152; GFX7-NEXT:    s_add_u32 s4, s4, s9
18153; GFX7-NEXT:    s_addc_u32 s8, s5, s8
18154; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
18155; GFX7-NEXT:    s_mov_b32 s5, s8
18156; GFX7-NEXT:    v_mov_b32_e32 v2, s7
18157; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18158; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18159; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18160; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18161; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18162; GFX7-NEXT:    s_waitcnt vmcnt(0)
18163; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
18164; GFX7-NEXT:    s_waitcnt vmcnt(0)
18165; GFX7-NEXT:    buffer_wbinvl1_vol
18166; GFX7-NEXT:    s_endpgm
18167;
18168; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18169; GFX10-WGP:       ; %bb.0: ; %entry
18170; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
18171; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
18172; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
18173; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
18174; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
18175; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18176; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
18177; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
18178; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
18179; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
18180; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
18181; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
18182; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
18183; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
18184; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
18185; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18186; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18187; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18188; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18189; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18190; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18191; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18192; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
18193; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18194; GFX10-WGP-NEXT:    buffer_gl1_inv
18195; GFX10-WGP-NEXT:    buffer_gl0_inv
18196; GFX10-WGP-NEXT:    s_endpgm
18197;
18198; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18199; GFX10-CU:       ; %bb.0: ; %entry
18200; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
18201; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
18202; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
18203; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
18204; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
18205; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18206; GFX10-CU-NEXT:    s_mov_b32 s4, s8
18207; GFX10-CU-NEXT:    s_mov_b32 s5, s9
18208; GFX10-CU-NEXT:    s_mov_b32 s9, s10
18209; GFX10-CU-NEXT:    s_mov_b32 s8, s11
18210; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
18211; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
18212; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
18213; GFX10-CU-NEXT:    s_mov_b32 s5, s8
18214; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
18215; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18216; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18217; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18218; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18219; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18220; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
18221; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
18222; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
18223; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
18224; GFX10-CU-NEXT:    buffer_gl1_inv
18225; GFX10-CU-NEXT:    buffer_gl0_inv
18226; GFX10-CU-NEXT:    s_endpgm
18227;
18228; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18229; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18230; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
18231; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
18232; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
18233; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
18234; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
18235; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18236; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
18237; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
18238; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
18239; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
18240; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
18241; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
18242; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
18243; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
18244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
18245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18246; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18247; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18248; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18250; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18251; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
18252; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18253; SKIP-CACHE-INV-NEXT:    s_endpgm
18254;
18255; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18256; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18257; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18258; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18259; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18260; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18261; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18262; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18263; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18264; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18265; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18266; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
18267; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18268; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
18269; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18270; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
18271; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
18272; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18273;
18274; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18275; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18276; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18277; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18278; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18279; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18280; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18281; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18282; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18283; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18284; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18285; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
18286; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18287; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
18288; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18289; GFX90A-TGSPLIT-NEXT:    buffer_invl2
18290; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18291; GFX90A-TGSPLIT-NEXT:    s_endpgm
18292;
18293; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18294; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18295; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18296; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18297; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18298; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18299; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18300; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18301; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18302; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18303; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18304; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
18305; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18306; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
18307; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18308; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
18309; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18310;
18311; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18312; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18313; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18314; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18315; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18316; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18317; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18318; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18319; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18320; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18321; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18322; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
18323; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18324; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
18325; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18326; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
18327; GFX940-TGSPLIT-NEXT:    s_endpgm
18328;
18329; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18330; GFX11-WGP:       ; %bb.0: ; %entry
18331; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18332; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18333; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18334; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18335; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18336; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18337; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18338; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18339; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18340; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18341; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18342; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18343; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
18344; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
18345; GFX11-WGP-NEXT:    buffer_gl1_inv
18346; GFX11-WGP-NEXT:    buffer_gl0_inv
18347; GFX11-WGP-NEXT:    s_endpgm
18348;
18349; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18350; GFX11-CU:       ; %bb.0: ; %entry
18351; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18352; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18353; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18354; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18355; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18356; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18357; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18358; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18359; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18360; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18361; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18362; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
18363; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
18364; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
18365; GFX11-CU-NEXT:    buffer_gl1_inv
18366; GFX11-CU-NEXT:    buffer_gl0_inv
18367; GFX11-CU-NEXT:    s_endpgm
18368;
18369; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18370; GFX12-WGP:       ; %bb.0: ; %entry
18371; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18372; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18373; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18374; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18375; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18376; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18377; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18378; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18379; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18380; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18381; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
18382; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
18383; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
18384; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18385; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18386; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
18387; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
18388; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
18389; GFX12-WGP-NEXT:    s_endpgm
18390;
18391; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
18392; GFX12-CU:       ; %bb.0: ; %entry
18393; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18394; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18395; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18396; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18397; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18398; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18399; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18400; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18401; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18402; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18403; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
18404; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
18405; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
18406; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18407; GFX12-CU-NEXT:    s_wait_storecnt 0x0
18408; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
18409; GFX12-CU-NEXT:    s_wait_storecnt 0x0
18410; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
18411; GFX12-CU-NEXT:    s_endpgm
18412    ptr %out, i32 %in, i32 %old) {
18413entry:
  ; Element index 4 of an i32 array is byte offset 16 from %out; targets either
  ; fold it into the instruction (offset:16) or add the constant 16 by hand.
18414  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare value and %in the replacement; %val is never read.
18415  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
18416  ret void
18417}
18418
18419define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
18420; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18421; GFX7:       ; %bb.0: ; %entry
18422; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18423; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18424; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18425; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18426; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18427; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18428; GFX7-NEXT:    s_mov_b32 s6, s4
18429; GFX7-NEXT:    s_mov_b32 s7, s5
18430; GFX7-NEXT:    s_mov_b32 s11, s12
18431; GFX7-NEXT:    s_mov_b32 s10, s13
18432; GFX7-NEXT:    s_add_u32 s6, s6, s11
18433; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18434; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18435; GFX7-NEXT:    s_mov_b32 s7, s10
18436; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18437; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18438; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18439; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18440; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18441; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18442; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18443; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18444; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18445; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18446; GFX7-NEXT:    flat_store_dword v[0:1], v2
18447; GFX7-NEXT:    s_endpgm
18448;
18449; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18450; GFX10-WGP:       ; %bb.0: ; %entry
18451; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18452; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18453; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18454; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18455; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18456; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18457; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18458; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18459; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18460; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18461; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18462; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18463; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18464; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18465; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18466; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18467; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18468; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18469; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18470; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18471; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18472; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18473; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18474; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18475; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18476; GFX10-WGP-NEXT:    s_endpgm
18477;
18478; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18479; GFX10-CU:       ; %bb.0: ; %entry
18480; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18481; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18482; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18483; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18484; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18485; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18486; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18487; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18488; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18489; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18490; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18491; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18492; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18493; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18494; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18495; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18496; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18497; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18498; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18499; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18500; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18501; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18502; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18503; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18504; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18505; GFX10-CU-NEXT:    s_endpgm
18506;
18507; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18508; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18509; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18510; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18511; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18512; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18513; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18514; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18515; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18516; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18517; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18518; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18519; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18520; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18521; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18522; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18523; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18524; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18525; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18526; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18527; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18529; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18530; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18531; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18532; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18533; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18534; SKIP-CACHE-INV-NEXT:    s_endpgm
18535;
18536; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18537; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18538; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18539; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18540; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18541; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18542; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18543; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18544; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18545; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18546; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18547; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18548; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18549; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18550; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18551; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18552;
18553; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18554; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18555; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18556; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18557; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18558; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18559; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18560; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18561; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18562; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18563; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18564; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18565; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18566; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18567; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18568; GFX90A-TGSPLIT-NEXT:    s_endpgm
18569;
18570; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18571; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18572; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18573; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18574; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18575; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18576; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18577; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18578; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18579; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18580; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18581; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
18582; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18583; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18584; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18585; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18586;
18587; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18588; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18589; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18590; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18591; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18592; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18593; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18594; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18595; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18596; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18597; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18598; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
18599; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18600; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18601; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18602; GFX940-TGSPLIT-NEXT:    s_endpgm
18603;
18604; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18605; GFX11-WGP:       ; %bb.0: ; %entry
18606; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18607; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18608; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18609; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18610; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18611; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18612; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18613; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18614; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18615; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18616; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18617; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18618; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18619; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18620; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18621; GFX11-WGP-NEXT:    s_endpgm
18622;
18623; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18624; GFX11-CU:       ; %bb.0: ; %entry
18625; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18626; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18627; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18628; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18629; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18630; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18631; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18632; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18633; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18634; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18635; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18636; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18637; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18638; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18639; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18640; GFX11-CU-NEXT:    s_endpgm
18641;
18642; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18643; GFX12-WGP:       ; %bb.0: ; %entry
18644; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18645; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18646; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18647; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18648; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18649; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18650; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18651; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18652; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18653; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18654; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18655; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18656; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18657; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
18658; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18659; GFX12-WGP-NEXT:    s_endpgm
18660;
18661; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
18662; GFX12-CU:       ; %bb.0: ; %entry
18663; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18664; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18665; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18666; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18667; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18668; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18669; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18670; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18671; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18672; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18673; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18674; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18675; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18676; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18677; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18678; GFX12-CU-NEXT:    s_endpgm
18679    ptr %out, i32 %in, i32 %old) {
18680entry:
18681  %gep = getelementptr i32, ptr %out, i32 4
18682  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
18683  %val0 = extractvalue { i32, i1 } %val, 0
18684  store i32 %val0, ptr %out, align 4
18685  ret void
18686}
18687
; Kernel under test: a volatile cmpxchg on a generic (flat) pointer whose loaded
; value is consumed afterwards, using syncscope("one-as") with acquire ordering
; on success and monotonic ordering on failure.
;
; What the assertions below expect, per llc invocation listed at the top of the file:
;   * a returning flat compare-swap atomic, immediately followed by a wait for
;     its result (s_waitcnt vmcnt(0), or s_wait_loadcnt 0x0 on gfx1200);
;   * then a cache invalidate before the dependent store:
;       gfx700            - buffer_wbinvl1_vol
;       gfx1010, gfx1100  - buffer_gl1_inv followed by buffer_gl0_inv
;       gfx90a            - buffer_invl2 followed by buffer_wbinvl1_vol
;       gfx940            - buffer_inv sc0 sc1
;       gfx1200           - global_inv scope(SCOPE_SYS)
;   * the invocation with -amdgcn-skip-cache-invalidations keeps the wait but
;     expects no invalidate instruction.
;
; The assertion lines are machine-generated (see the autogeneration notice on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
18688define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
18689; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18690; GFX7:       ; %bb.0: ; %entry
18691; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18692; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18693; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18694; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18695; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18696; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18697; GFX7-NEXT:    s_mov_b32 s6, s4
18698; GFX7-NEXT:    s_mov_b32 s7, s5
18699; GFX7-NEXT:    s_mov_b32 s11, s12
18700; GFX7-NEXT:    s_mov_b32 s10, s13
18701; GFX7-NEXT:    s_add_u32 s6, s6, s11
18702; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18703; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18704; GFX7-NEXT:    s_mov_b32 s7, s10
18705; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18706; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18707; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18708; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18709; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18710; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18711; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18712; GFX7-NEXT:    s_waitcnt vmcnt(0)
18713; GFX7-NEXT:    buffer_wbinvl1_vol
18714; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18715; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18716; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18717; GFX7-NEXT:    flat_store_dword v[0:1], v2
18718; GFX7-NEXT:    s_endpgm
18719;
18720; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18721; GFX10-WGP:       ; %bb.0: ; %entry
18722; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18723; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18724; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18725; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18726; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18727; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18728; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18729; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18730; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18731; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18732; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18733; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18734; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18735; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18736; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18737; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18738; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18739; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18740; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18741; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18742; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18743; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18744; GFX10-WGP-NEXT:    buffer_gl1_inv
18745; GFX10-WGP-NEXT:    buffer_gl0_inv
18746; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18747; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18748; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18749; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18750; GFX10-WGP-NEXT:    s_endpgm
18751;
18752; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18753; GFX10-CU:       ; %bb.0: ; %entry
18754; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18755; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18756; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18757; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18758; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18759; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18760; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18761; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18762; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18763; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18764; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18765; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18766; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18767; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18768; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18769; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18770; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18771; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18772; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18773; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18774; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18775; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
18776; GFX10-CU-NEXT:    buffer_gl1_inv
18777; GFX10-CU-NEXT:    buffer_gl0_inv
18778; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18779; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18780; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18781; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18782; GFX10-CU-NEXT:    s_endpgm
18783;
18784; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18785; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18786; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18787; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18788; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18789; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18790; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18791; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18792; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18793; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18794; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18795; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18796; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18797; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18798; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18799; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18800; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18801; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18802; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18803; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18806; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18807; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18809; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18810; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18811; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18812; SKIP-CACHE-INV-NEXT:    s_endpgm
18813;
18814; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18815; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18816; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18817; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18818; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18819; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18820; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18821; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18822; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18823; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18824; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18825; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18826; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18827; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
18828; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
18829; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18830; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18831; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18832; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18833;
18834; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18835; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18836; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18837; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18838; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18839; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18840; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18841; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18842; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18843; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18844; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18845; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18846; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18847; GFX90A-TGSPLIT-NEXT:    buffer_invl2
18848; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
18849; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18850; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18851; GFX90A-TGSPLIT-NEXT:    s_endpgm
18852;
18853; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18854; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18855; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18856; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18857; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18858; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18859; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18860; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18861; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18862; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18863; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18864; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
18865; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18866; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
18867; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18868; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18869; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18870; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18871;
18872; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18873; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18874; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18875; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18876; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18877; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18878; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18879; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18880; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18881; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18882; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18883; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
18884; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18885; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
18886; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18887; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18888; GFX940-TGSPLIT-NEXT:    s_endpgm
18889;
18890; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18891; GFX11-WGP:       ; %bb.0: ; %entry
18892; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18893; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18894; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18895; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18896; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18897; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18898; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18899; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18900; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18901; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18902; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18903; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18904; GFX11-WGP-NEXT:    buffer_gl1_inv
18905; GFX11-WGP-NEXT:    buffer_gl0_inv
18906; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18907; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18908; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18909; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18910; GFX11-WGP-NEXT:    s_endpgm
18911;
18912; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18913; GFX11-CU:       ; %bb.0: ; %entry
18914; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18915; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18916; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18917; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18918; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18919; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18920; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18921; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18922; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18923; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18924; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18925; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18926; GFX11-CU-NEXT:    buffer_gl1_inv
18927; GFX11-CU-NEXT:    buffer_gl0_inv
18928; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18929; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18930; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18931; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18932; GFX11-CU-NEXT:    s_endpgm
18933;
18934; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18935; GFX12-WGP:       ; %bb.0: ; %entry
18936; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18937; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18938; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18939; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18940; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18941; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18942; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18943; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18944; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18945; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18946; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18947; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18948; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
18949; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18950; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18951; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
18952; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18953; GFX12-WGP-NEXT:    s_endpgm
18954;
18955; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
18956; GFX12-CU:       ; %bb.0: ; %entry
18957; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18958; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18959; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18960; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18961; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18962; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18963; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18964; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18965; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18966; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18967; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18968; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18969; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
18970; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18971; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18972; GFX12-CU-NEXT:    s_wait_dscnt 0x0
18973; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18974; GFX12-CU-NEXT:    s_endpgm
18975    ptr %out, i32 %in, i32 %old) {
18976entry:
  ; Address of the i32 at element index 4 past %out (byte offset 16, which is
  ; the offset the expected atomic instructions above operate on).
18977  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap at that address: %old is the expected value and
  ; %in the replacement; acquire ordering on success, monotonic on failure.
18978  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
  ; Take field 0 of the { i32, i1 } result; field 1 is not used here.
18979  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the extracted value to %out so the atomic's result has a user.
18980  store i32 %val0, ptr %out, align 4
18981  ret void
18982}
18983
18984define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
18985; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
18986; GFX7:       ; %bb.0: ; %entry
18987; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18988; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18989; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18990; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18991; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18992; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18993; GFX7-NEXT:    s_mov_b32 s6, s4
18994; GFX7-NEXT:    s_mov_b32 s7, s5
18995; GFX7-NEXT:    s_mov_b32 s11, s12
18996; GFX7-NEXT:    s_mov_b32 s10, s13
18997; GFX7-NEXT:    s_add_u32 s6, s6, s11
18998; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18999; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19000; GFX7-NEXT:    s_mov_b32 s7, s10
19001; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19002; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19003; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19004; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19005; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19006; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19007; GFX7-NEXT:    s_waitcnt vmcnt(0)
19008; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19009; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19010; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19011; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19012; GFX7-NEXT:    flat_store_dword v[0:1], v2
19013; GFX7-NEXT:    s_endpgm
19014;
19015; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19016; GFX10-WGP:       ; %bb.0: ; %entry
19017; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19018; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19019; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19020; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19021; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19022; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19023; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19024; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19025; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19026; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19027; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19028; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19029; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19030; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19031; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19032; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19033; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19034; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19035; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19036; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19037; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19038; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19039; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19040; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19041; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19042; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19043; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19044; GFX10-WGP-NEXT:    s_endpgm
19045;
19046; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19047; GFX10-CU:       ; %bb.0: ; %entry
19048; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19049; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19050; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19051; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19052; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19053; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19054; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19055; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19056; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19057; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19058; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19059; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19060; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19061; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19062; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19063; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19064; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19065; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19066; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19067; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19068; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
19069; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
19070; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19071; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19072; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19073; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19074; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19075; GFX10-CU-NEXT:    s_endpgm
19076;
19077; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19078; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19079; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19080; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19081; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19082; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19083; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19084; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19085; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19086; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19087; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19088; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19089; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19090; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19091; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19092; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19093; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19095; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19096; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19097; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19099; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
19100; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19101; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19102; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19103; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19104; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19105; SKIP-CACHE-INV-NEXT:    s_endpgm
19106;
19107; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19108; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19109; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19110; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19111; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19112; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19113; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19114; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19115; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19116; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19117; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19118; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
19119; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19120; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19121; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19122; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19123; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19124; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19125;
19126; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19127; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19128; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19129; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19130; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19131; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19132; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19133; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19134; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19135; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19136; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19137; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
19138; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19139; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19140; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19141; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19142; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19143; GFX90A-TGSPLIT-NEXT:    s_endpgm
19144;
19145; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19146; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19147; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19148; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19149; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19150; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19151; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19152; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19153; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19154; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19155; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19156; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
19157; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19158; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
19159; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19160; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19161; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19162; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19163;
19164; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19165; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19166; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19167; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19168; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19169; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19170; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19171; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19172; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19173; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19174; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19175; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
19176; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19177; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
19178; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19179; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19180; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19181; GFX940-TGSPLIT-NEXT:    s_endpgm
19182;
19183; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19184; GFX11-WGP:       ; %bb.0: ; %entry
19185; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19186; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19187; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19188; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19189; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19190; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19191; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19192; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19193; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19194; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19195; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19196; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19197; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19198; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19199; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19200; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19201; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19202; GFX11-WGP-NEXT:    s_endpgm
19203;
19204; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19205; GFX11-CU:       ; %bb.0: ; %entry
19206; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19207; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19208; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19209; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19210; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19211; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19212; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19213; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19214; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19215; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19216; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
19217; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
19218; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19219; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19220; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19221; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19222; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19223; GFX11-CU-NEXT:    s_endpgm
19224;
19225; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19226; GFX12-WGP:       ; %bb.0: ; %entry
19227; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19228; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19229; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19230; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19231; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19232; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19233; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19234; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19235; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19236; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19237; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
19238; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19239; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19240; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19241; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19242; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
19243; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19244; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19245; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
19246; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19247; GFX12-WGP-NEXT:    s_endpgm
19248;
19249; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
19250; GFX12-CU:       ; %bb.0: ; %entry
19251; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19252; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19253; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19254; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19255; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19256; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19257; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19258; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19259; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19260; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19261; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
19262; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
19263; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
19264; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
19265; GFX12-CU-NEXT:    s_wait_storecnt 0x0
19266; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
19267; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19268; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19269; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19270; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19271; GFX12-CU-NEXT:    s_endpgm
19272    ptr %out, i32 %in, i32 %old) {
19273entry:
19274  %gep = getelementptr i32, ptr %out, i32 4
19275  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
19276  %val0 = extractvalue { i32, i1 } %val, 0
19277  store i32 %val0, ptr %out, align 4
19278  ret void
19279}
19280
; Returning cmpxchg through a flat (generic `ptr`) address in the "one-as"
; sync scope, with acq_rel success ordering and monotonic failure ordering.
;
; The kernel compares the i32 at %out[4] (byte offset 16) with %old, swaps in
; %in when they are equal, and stores the value that was read back to %out[0].
;
; In every expected sequence inside the definition the atomic is preceded by a
; counter wait (s_waitcnt / s_wait_*) and followed by another one; every run
; except the one using -amdgcn-skip-cache-invalidations then expects one or
; more cache-invalidate instructions before the final store.
;
; The per-target assertion lines are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of the file);
; regenerate them with that script instead of editing them by hand.
19281define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
19282; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19283; GFX7:       ; %bb.0: ; %entry
19284; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19285; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19286; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19287; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19288; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19289; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19290; GFX7-NEXT:    s_mov_b32 s6, s4
19291; GFX7-NEXT:    s_mov_b32 s7, s5
19292; GFX7-NEXT:    s_mov_b32 s11, s12
19293; GFX7-NEXT:    s_mov_b32 s10, s13
19294; GFX7-NEXT:    s_add_u32 s6, s6, s11
19295; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19296; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19297; GFX7-NEXT:    s_mov_b32 s7, s10
19298; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19299; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19300; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19301; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19302; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19303; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19304; GFX7-NEXT:    s_waitcnt vmcnt(0)
19305; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19306; GFX7-NEXT:    s_waitcnt vmcnt(0)
19307; GFX7-NEXT:    buffer_wbinvl1_vol
19308; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19309; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19310; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19311; GFX7-NEXT:    flat_store_dword v[0:1], v2
19312; GFX7-NEXT:    s_endpgm
19313;
19314; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19315; GFX10-WGP:       ; %bb.0: ; %entry
19316; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19317; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19318; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19319; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19320; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19321; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19322; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19323; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19324; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19325; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19326; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19327; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19328; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19329; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19330; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19331; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19332; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19333; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19334; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19335; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19336; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19337; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19338; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19339; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19340; GFX10-WGP-NEXT:    buffer_gl1_inv
19341; GFX10-WGP-NEXT:    buffer_gl0_inv
19342; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19343; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19344; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19345; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19346; GFX10-WGP-NEXT:    s_endpgm
19347;
19348; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19349; GFX10-CU:       ; %bb.0: ; %entry
19350; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19351; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19352; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19353; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19354; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19355; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19356; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19357; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19358; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19359; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19360; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19361; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19362; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19363; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19364; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19365; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19366; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19367; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19368; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19369; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19370; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
19371; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
19372; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19373; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
19374; GFX10-CU-NEXT:    buffer_gl1_inv
19375; GFX10-CU-NEXT:    buffer_gl0_inv
19376; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19377; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19378; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19379; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19380; GFX10-CU-NEXT:    s_endpgm
19381;
19382; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19383; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19384; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19385; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19386; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19387; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19388; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19389; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19390; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19391; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19392; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19393; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19394; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19395; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19396; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19397; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19398; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19399; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19400; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19401; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19402; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19403; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19404; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
19405; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19406; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
19407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19408; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19409; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19410; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19411; SKIP-CACHE-INV-NEXT:    s_endpgm
19412;
19413; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19414; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19415; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19416; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19417; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19418; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19419; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19420; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19421; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19422; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19423; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19424; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
19425; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19426; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19427; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19428; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
19429; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
19430; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19431; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19432; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19433; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19434;
19435; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19436; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19437; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19438; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19439; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19440; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19441; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19442; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19443; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19444; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19445; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19446; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
19447; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19448; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19449; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19450; GFX90A-TGSPLIT-NEXT:    buffer_invl2
19451; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
19452; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19453; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19454; GFX90A-TGSPLIT-NEXT:    s_endpgm
19455;
19456; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19457; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19458; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19459; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19460; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19461; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19462; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19463; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19464; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19465; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19466; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19467; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
19468; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19469; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
19470; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19471; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
19472; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19473; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19474; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19475; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19476;
19477; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19478; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19479; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19480; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19481; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19482; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19483; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19484; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19485; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19486; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19487; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19488; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
19489; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19490; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
19491; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19492; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
19493; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19494; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19495; GFX940-TGSPLIT-NEXT:    s_endpgm
19496;
19497; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19498; GFX11-WGP:       ; %bb.0: ; %entry
19499; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19500; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19501; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19502; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19503; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19504; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19505; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19506; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19507; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19508; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19509; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19510; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19511; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19512; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19513; GFX11-WGP-NEXT:    buffer_gl1_inv
19514; GFX11-WGP-NEXT:    buffer_gl0_inv
19515; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19516; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19517; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19518; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19519; GFX11-WGP-NEXT:    s_endpgm
19520;
19521; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19522; GFX11-CU:       ; %bb.0: ; %entry
19523; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19524; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19525; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19526; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19527; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19528; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19529; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19530; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19531; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19532; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19533; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
19534; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
19535; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19536; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
19537; GFX11-CU-NEXT:    buffer_gl1_inv
19538; GFX11-CU-NEXT:    buffer_gl0_inv
19539; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19540; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19541; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19542; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19543; GFX11-CU-NEXT:    s_endpgm
19544;
19545; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19546; GFX12-WGP:       ; %bb.0: ; %entry
19547; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19548; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19549; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19550; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19551; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19552; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19553; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19554; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19555; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19556; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19557; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
19558; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19559; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19560; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19561; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19562; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
19563; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19564; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19565; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19566; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
19567; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19568; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19569; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
19570; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19571; GFX12-WGP-NEXT:    s_endpgm
19572;
19573; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
19574; GFX12-CU:       ; %bb.0: ; %entry
19575; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19576; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19577; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19578; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19579; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19580; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19581; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19582; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19583; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19584; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19585; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
19586; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
19587; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
19588; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
19589; GFX12-CU-NEXT:    s_wait_storecnt 0x0
19590; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
19591; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
19592; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
19593; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
19594; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
19595; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19596; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19597; GFX12-CU-NEXT:    s_wait_dscnt 0x0
19598; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19599; GFX12-CU-NEXT:    s_endpgm
19600    ptr %out, i32 %in, i32 %old) {
19601entry:
  ; Element 4 of an i32 array is byte offset 16; the expected atomics above
  ; reach it either with an offset:16 immediate or an explicit 64-bit add of 16.
19602  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap: expected value %old, replacement %in,
  ; success ordering acq_rel, failure ordering monotonic.
19603  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this test.
19604  %val0 = extractvalue { i32, i1 } %val, 0
  ; Consuming the result keeps the atomic in its returning form.
19605  store i32 %val0, ptr %out, align 4
19606  ret void
19607}
19608
19609define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
19610; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19611; GFX7:       ; %bb.0: ; %entry
19612; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19613; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19614; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19615; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19616; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19617; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19618; GFX7-NEXT:    s_mov_b32 s6, s4
19619; GFX7-NEXT:    s_mov_b32 s7, s5
19620; GFX7-NEXT:    s_mov_b32 s11, s12
19621; GFX7-NEXT:    s_mov_b32 s10, s13
19622; GFX7-NEXT:    s_add_u32 s6, s6, s11
19623; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19624; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19625; GFX7-NEXT:    s_mov_b32 s7, s10
19626; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19627; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19628; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19629; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19630; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19631; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19632; GFX7-NEXT:    s_waitcnt vmcnt(0)
19633; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19634; GFX7-NEXT:    s_waitcnt vmcnt(0)
19635; GFX7-NEXT:    buffer_wbinvl1_vol
19636; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19637; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19638; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19639; GFX7-NEXT:    flat_store_dword v[0:1], v2
19640; GFX7-NEXT:    s_endpgm
19641;
19642; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19643; GFX10-WGP:       ; %bb.0: ; %entry
19644; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19645; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19646; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19647; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19648; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19649; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19650; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19651; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19652; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19653; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19654; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19655; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19656; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19657; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19658; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19659; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19660; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19661; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19662; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19663; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19664; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19665; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19666; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19667; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19668; GFX10-WGP-NEXT:    buffer_gl1_inv
19669; GFX10-WGP-NEXT:    buffer_gl0_inv
19670; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19671; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19672; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19673; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19674; GFX10-WGP-NEXT:    s_endpgm
19675;
19676; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19677; GFX10-CU:       ; %bb.0: ; %entry
19678; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19679; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19680; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19681; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19682; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19683; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19684; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19685; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19686; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19687; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19688; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19689; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19690; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19691; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19692; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19693; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19694; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19695; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19696; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19697; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19698; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
19699; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
19700; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19701; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
19702; GFX10-CU-NEXT:    buffer_gl1_inv
19703; GFX10-CU-NEXT:    buffer_gl0_inv
19704; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19705; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19706; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19707; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19708; GFX10-CU-NEXT:    s_endpgm
19709;
19710; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19711; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19712; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19713; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19714; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19715; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19716; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19717; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19718; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19719; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19720; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19721; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19722; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19723; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19724; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19725; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19726; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19727; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19728; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19729; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19730; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19731; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19732; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
19733; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19734; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
19735; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19736; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19737; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19738; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19739; SKIP-CACHE-INV-NEXT:    s_endpgm
19740;
19741; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19742; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19743; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19744; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19745; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19746; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19747; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19748; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19749; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19750; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19751; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19752; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
19753; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19754; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19755; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19756; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
19757; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
19758; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19759; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19760; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19761; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19762;
19763; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19764; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19765; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19766; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19767; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19768; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19769; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19770; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19771; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19772; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19773; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19774; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
19775; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19776; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19777; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19778; GFX90A-TGSPLIT-NEXT:    buffer_invl2
19779; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
19780; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19781; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19782; GFX90A-TGSPLIT-NEXT:    s_endpgm
19783;
19784; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19785; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19786; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19787; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19788; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19789; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19790; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19791; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19792; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19793; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19794; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19795; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
19796; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19797; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
19798; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19799; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
19800; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19801; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19802; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19803; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19804;
19805; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19806; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19807; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19808; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19809; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19810; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19811; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19812; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19813; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19814; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19815; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19816; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
19817; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19818; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
19819; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19820; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
19821; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19822; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19823; GFX940-TGSPLIT-NEXT:    s_endpgm
19824;
19825; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19826; GFX11-WGP:       ; %bb.0: ; %entry
19827; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19828; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19829; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19830; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19831; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19832; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19833; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19834; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19835; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19836; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19837; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19838; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
19839; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19840; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19841; GFX11-WGP-NEXT:    buffer_gl1_inv
19842; GFX11-WGP-NEXT:    buffer_gl0_inv
19843; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19844; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19845; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19846; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19847; GFX11-WGP-NEXT:    s_endpgm
19848;
19849; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19850; GFX11-CU:       ; %bb.0: ; %entry
19851; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19852; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19853; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19854; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19855; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19856; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19857; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19858; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19859; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19860; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19861; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
19862; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
19863; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19864; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
19865; GFX11-CU-NEXT:    buffer_gl1_inv
19866; GFX11-CU-NEXT:    buffer_gl0_inv
19867; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19868; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19869; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19870; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19871; GFX11-CU-NEXT:    s_endpgm
19872;
19873; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19874; GFX12-WGP:       ; %bb.0: ; %entry
19875; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19876; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19877; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19878; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19879; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19880; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19881; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19882; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19883; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19884; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19885; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
19886; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19887; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19888; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19889; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
19890; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
19891; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
19892; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
19893; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19894; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
19895; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19896; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19897; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
19898; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19899; GFX12-WGP-NEXT:    s_endpgm
19900;
19901; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
19902; GFX12-CU:       ; %bb.0: ; %entry
19903; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19904; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19905; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19906; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19907; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19908; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19909; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19910; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19911; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19912; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19913; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
19914; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
19915; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
19916; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
19917; GFX12-CU-NEXT:    s_wait_storecnt 0x0
19918; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
19919; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
19920; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
19921; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
19922; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
19923; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19924; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19925; GFX12-CU-NEXT:    s_wait_dscnt 0x0
19926; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19927; GFX12-CU-NEXT:    s_endpgm
19928    ptr %out, i32 %in, i32 %old) {
19929entry:
19930  %gep = getelementptr i32, ptr %out, i32 4
19931  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
19932  %val0 = extractvalue { i32, i1 } %val, 0
19933  store i32 %val0, ptr %out, align 4
19934  ret void
19935}
19936
; Kernel under test. It performs a volatile i32 cmpxchg on %out[4] through a
; plain `ptr` (default address space) with syncscope("one-as"), success
; ordering monotonic and failure ordering acquire, then stores the value the
; cmpxchg read from memory to %out[0]. The assertions interleaved below pin the
; llc -O0 output for each RUN configuration listed at the top of the file. They
; are autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
19937define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
19938; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
19939; GFX7:       ; %bb.0: ; %entry
19940; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19941; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19942; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19943; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19944; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19945; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19946; GFX7-NEXT:    s_mov_b32 s6, s4
19947; GFX7-NEXT:    s_mov_b32 s7, s5
19948; GFX7-NEXT:    s_mov_b32 s11, s12
19949; GFX7-NEXT:    s_mov_b32 s10, s13
19950; GFX7-NEXT:    s_add_u32 s6, s6, s11
19951; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19952; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19953; GFX7-NEXT:    s_mov_b32 s7, s10
19954; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19955; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19956; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19957; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19958; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19959; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19960; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19961; GFX7-NEXT:    s_waitcnt vmcnt(0)
19962; GFX7-NEXT:    buffer_wbinvl1_vol
19963; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19964; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19965; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19966; GFX7-NEXT:    flat_store_dword v[0:1], v2
19967; GFX7-NEXT:    s_endpgm
19968;
19969; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
19970; GFX10-WGP:       ; %bb.0: ; %entry
19971; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19972; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19973; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19974; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19975; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19976; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19977; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19978; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19979; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19980; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19981; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19982; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19983; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19984; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19985; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19986; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19987; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19988; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19989; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19990; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19991; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19992; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19993; GFX10-WGP-NEXT:    buffer_gl1_inv
19994; GFX10-WGP-NEXT:    buffer_gl0_inv
19995; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19996; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19997; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19998; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19999; GFX10-WGP-NEXT:    s_endpgm
20000;
20001; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20002; GFX10-CU:       ; %bb.0: ; %entry
20003; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
20004; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20005; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
20006; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
20007; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
20008; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20009; GFX10-CU-NEXT:    s_mov_b32 s6, s4
20010; GFX10-CU-NEXT:    s_mov_b32 s7, s5
20011; GFX10-CU-NEXT:    s_mov_b32 s11, s12
20012; GFX10-CU-NEXT:    s_mov_b32 s10, s13
20013; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
20014; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
20015; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20016; GFX10-CU-NEXT:    s_mov_b32 s7, s10
20017; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
20018; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
20019; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20020; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
20021; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
20022; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
20023; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20024; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
20025; GFX10-CU-NEXT:    buffer_gl1_inv
20026; GFX10-CU-NEXT:    buffer_gl0_inv
20027; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
20028; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
20029; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20030; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
20031; GFX10-CU-NEXT:    s_endpgm
20032;
20033; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20034; SKIP-CACHE-INV:       ; %bb.0: ; %entry
20035; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
20036; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
20037; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
20038; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
20039; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
20040; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20041; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
20042; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
20043; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
20044; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
20045; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
20046; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
20047; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
20048; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
20049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
20050; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
20051; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
20053; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
20054; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
20055; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20056; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
20057; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
20058; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
20059; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20060; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
20061; SKIP-CACHE-INV-NEXT:    s_endpgm
20062;
20063; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20064; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
20065; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20066; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20067; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20068; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20069; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20070; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20071; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20072; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20073; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20074; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20075; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20076; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
20077; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
20078; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20079; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20080; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20081; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
20082;
20083; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20084; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
20085; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20086; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20087; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20088; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20089; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20090; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20091; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20092; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20093; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20094; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20095; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20096; GFX90A-TGSPLIT-NEXT:    buffer_invl2
20097; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
20098; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20099; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20100; GFX90A-TGSPLIT-NEXT:    s_endpgm
20101;
20102; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20103; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
20104; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20105; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20106; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20107; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20108; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20109; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20110; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20111; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20112; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20113; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
20114; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20115; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
20116; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20117; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20118; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20119; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
20120;
20121; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20122; GFX940-TGSPLIT:       ; %bb.0: ; %entry
20123; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20124; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20125; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20126; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20127; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20128; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20129; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20130; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20131; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20132; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
20133; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20134; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
20135; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20136; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20137; GFX940-TGSPLIT-NEXT:    s_endpgm
20138;
20139; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20140; GFX11-WGP:       ; %bb.0: ; %entry
20141; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20142; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20143; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20144; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20145; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
20146; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
20147; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20148; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
20149; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20150; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20151; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20152; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
20153; GFX11-WGP-NEXT:    buffer_gl1_inv
20154; GFX11-WGP-NEXT:    buffer_gl0_inv
20155; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20156; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20157; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20158; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
20159; GFX11-WGP-NEXT:    s_endpgm
20160;
20161; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20162; GFX11-CU:       ; %bb.0: ; %entry
20163; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20164; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20165; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20166; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20167; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
20168; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
20169; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20170; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
20171; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20172; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20173; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20174; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
20175; GFX11-CU-NEXT:    buffer_gl1_inv
20176; GFX11-CU-NEXT:    buffer_gl0_inv
20177; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20178; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20179; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20180; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
20181; GFX11-CU-NEXT:    s_endpgm
20182;
20183; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20184; GFX12-WGP:       ; %bb.0: ; %entry
20185; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20186; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20187; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20188; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
20189; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
20190; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
20191; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20192; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
20193; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20194; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20195; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20196; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
20197; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
20198; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
20199; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
20200; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20201; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20202; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
20203; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
20204; GFX12-WGP-NEXT:    s_endpgm
20205;
20206; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
20207; GFX12-CU:       ; %bb.0: ; %entry
20208; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20209; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20210; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20211; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
20212; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
20213; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
20214; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20215; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
20216; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20217; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20218; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20219; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
20220; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
20221; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
20222; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
20223; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20224; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20225; GFX12-CU-NEXT:    s_wait_dscnt 0x0
20226; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
20227; GFX12-CU-NEXT:    s_endpgm
20228    ptr %out, i32 %in, i32 %old) {
20229entry:
  ; Address of i32 element 4 of %out, i.e. byte offset 16 from the base pointer
  ; (the offset:16 operand, or the explicit add of 16, in the checks above).
20230  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; The two trailing orderings are <success> then <failure>.
20231  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
  ; Field 0 of the { i32, i1 } result is the value that was read from memory.
20232  %val0 = extractvalue { i32, i1 } %val, 0
  ; Plain (non-atomic) store of that value to the start of %out.
20233  store i32 %val0, ptr %out, align 4
20234  ret void
20235}
20236
20237define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
20238; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20239; GFX7:       ; %bb.0: ; %entry
20240; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
20241; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20242; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
20243; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
20244; GFX7-NEXT:    s_mov_b64 s[12:13], 16
20245; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20246; GFX7-NEXT:    s_mov_b32 s6, s4
20247; GFX7-NEXT:    s_mov_b32 s7, s5
20248; GFX7-NEXT:    s_mov_b32 s11, s12
20249; GFX7-NEXT:    s_mov_b32 s10, s13
20250; GFX7-NEXT:    s_add_u32 s6, s6, s11
20251; GFX7-NEXT:    s_addc_u32 s10, s7, s10
20252; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20253; GFX7-NEXT:    s_mov_b32 s7, s10
20254; GFX7-NEXT:    v_mov_b32_e32 v2, s9
20255; GFX7-NEXT:    v_mov_b32_e32 v0, s8
20256; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20257; GFX7-NEXT:    v_mov_b32_e32 v3, v0
20258; GFX7-NEXT:    v_mov_b32_e32 v0, s6
20259; GFX7-NEXT:    v_mov_b32_e32 v1, s7
20260; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20261; GFX7-NEXT:    s_waitcnt vmcnt(0)
20262; GFX7-NEXT:    buffer_wbinvl1_vol
20263; GFX7-NEXT:    v_mov_b32_e32 v0, s4
20264; GFX7-NEXT:    v_mov_b32_e32 v1, s5
20265; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20266; GFX7-NEXT:    flat_store_dword v[0:1], v2
20267; GFX7-NEXT:    s_endpgm
20268;
20269; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20270; GFX10-WGP:       ; %bb.0: ; %entry
20271; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
20272; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20273; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
20274; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
20275; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
20276; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20277; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
20278; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
20279; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
20280; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
20281; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
20282; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
20283; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20284; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
20285; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
20286; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
20287; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20288; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
20289; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
20290; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
20291; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20292; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20293; GFX10-WGP-NEXT:    buffer_gl1_inv
20294; GFX10-WGP-NEXT:    buffer_gl0_inv
20295; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
20296; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
20297; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20298; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
20299; GFX10-WGP-NEXT:    s_endpgm
20300;
20301; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20302; GFX10-CU:       ; %bb.0: ; %entry
20303; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
20304; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20305; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
20306; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
20307; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
20308; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20309; GFX10-CU-NEXT:    s_mov_b32 s6, s4
20310; GFX10-CU-NEXT:    s_mov_b32 s7, s5
20311; GFX10-CU-NEXT:    s_mov_b32 s11, s12
20312; GFX10-CU-NEXT:    s_mov_b32 s10, s13
20313; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
20314; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
20315; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20316; GFX10-CU-NEXT:    s_mov_b32 s7, s10
20317; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
20318; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
20319; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20320; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
20321; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
20322; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
20323; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20324; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
20325; GFX10-CU-NEXT:    buffer_gl1_inv
20326; GFX10-CU-NEXT:    buffer_gl0_inv
20327; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
20328; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
20329; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20330; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
20331; GFX10-CU-NEXT:    s_endpgm
20332;
20333; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20334; SKIP-CACHE-INV:       ; %bb.0: ; %entry
20335; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
20336; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
20337; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
20338; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
20339; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
20340; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20341; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
20342; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
20343; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
20344; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
20345; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
20346; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
20347; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
20348; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
20349; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
20350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
20351; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20352; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
20353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
20354; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
20355; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20356; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
20357; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
20358; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
20359; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20360; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
20361; SKIP-CACHE-INV-NEXT:    s_endpgm
20362;
20363; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20364; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
20365; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20366; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20367; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20368; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20369; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20370; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20371; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20372; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20373; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20374; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20375; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20376; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
20377; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
20378; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20379; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20380; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20381; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
20382;
20383; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20384; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
20385; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20386; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20387; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20388; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20389; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20390; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20391; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20392; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20393; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20394; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20395; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20396; GFX90A-TGSPLIT-NEXT:    buffer_invl2
20397; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
20398; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20399; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20400; GFX90A-TGSPLIT-NEXT:    s_endpgm
20401;
20402; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20403; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
20404; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20405; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20406; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20407; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20408; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20409; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20410; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20411; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20412; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20413; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
20414; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20415; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
20416; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20417; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20418; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20419; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
20420;
20421; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20422; GFX940-TGSPLIT:       ; %bb.0: ; %entry
20423; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20424; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20425; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20426; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20427; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20428; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20429; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20430; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20431; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20432; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
20433; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20434; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
20435; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20436; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20437; GFX940-TGSPLIT-NEXT:    s_endpgm
20438;
20439; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20440; GFX11-WGP:       ; %bb.0: ; %entry
20441; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20442; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20443; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20444; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20445; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
20446; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
20447; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20448; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
20449; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20450; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20451; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20452; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
20453; GFX11-WGP-NEXT:    buffer_gl1_inv
20454; GFX11-WGP-NEXT:    buffer_gl0_inv
20455; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20456; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20457; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20458; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
20459; GFX11-WGP-NEXT:    s_endpgm
20460;
20461; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20462; GFX11-CU:       ; %bb.0: ; %entry
20463; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20464; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20465; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20466; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20467; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
20468; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
20469; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20470; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
20471; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20472; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20473; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20474; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
20475; GFX11-CU-NEXT:    buffer_gl1_inv
20476; GFX11-CU-NEXT:    buffer_gl0_inv
20477; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20478; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20479; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20480; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
20481; GFX11-CU-NEXT:    s_endpgm
20482;
20483; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20484; GFX12-WGP:       ; %bb.0: ; %entry
20485; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20486; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20487; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20488; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
20489; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
20490; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
20491; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20492; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
20493; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20494; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20495; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20496; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
20497; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
20498; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20499; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20500; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
20501; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
20502; GFX12-WGP-NEXT:    s_endpgm
20503;
20504; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
20505; GFX12-CU:       ; %bb.0: ; %entry
20506; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20507; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20508; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20509; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
20510; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
20511; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
20512; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20513; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
20514; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20515; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20516; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20517; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
20518; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
20519; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20520; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20521; GFX12-CU-NEXT:    s_wait_dscnt 0x0
20522; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
20523; GFX12-CU-NEXT:    s_endpgm
20524    ptr %out, i32 %in, i32 %old) {
20525entry:
20526  %gep = getelementptr i32, ptr %out, i32 4
20527  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
20528  %val0 = extractvalue { i32, i1 } %val, 0
20529  store i32 %val0, ptr %out, align 4
20530  ret void
20531}
20532
; Returning cmpxchg through a generic `ptr` with syncscope("one-as"), success
; ordering `release` and failure ordering `acquire`. The value produced by the
; cmpxchg is stored back through %out, so the result of the atomic is used.
; The per-target assertion lines inside this definition are autogenerated (see
; the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
20533define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
20534; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20535; GFX7:       ; %bb.0: ; %entry
20536; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
20537; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20538; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
20539; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
20540; GFX7-NEXT:    s_mov_b64 s[12:13], 16
20541; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20542; GFX7-NEXT:    s_mov_b32 s6, s4
20543; GFX7-NEXT:    s_mov_b32 s7, s5
20544; GFX7-NEXT:    s_mov_b32 s11, s12
20545; GFX7-NEXT:    s_mov_b32 s10, s13
20546; GFX7-NEXT:    s_add_u32 s6, s6, s11
20547; GFX7-NEXT:    s_addc_u32 s10, s7, s10
20548; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20549; GFX7-NEXT:    s_mov_b32 s7, s10
20550; GFX7-NEXT:    v_mov_b32_e32 v2, s9
20551; GFX7-NEXT:    v_mov_b32_e32 v0, s8
20552; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20553; GFX7-NEXT:    v_mov_b32_e32 v3, v0
20554; GFX7-NEXT:    v_mov_b32_e32 v0, s6
20555; GFX7-NEXT:    v_mov_b32_e32 v1, s7
20556; GFX7-NEXT:    s_waitcnt vmcnt(0)
20557; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20558; GFX7-NEXT:    s_waitcnt vmcnt(0)
20559; GFX7-NEXT:    buffer_wbinvl1_vol
20560; GFX7-NEXT:    v_mov_b32_e32 v0, s4
20561; GFX7-NEXT:    v_mov_b32_e32 v1, s5
20562; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20563; GFX7-NEXT:    flat_store_dword v[0:1], v2
20564; GFX7-NEXT:    s_endpgm
20565;
20566; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20567; GFX10-WGP:       ; %bb.0: ; %entry
20568; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
20569; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20570; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
20571; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
20572; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
20573; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20574; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
20575; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
20576; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
20577; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
20578; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
20579; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
20580; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20581; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
20582; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
20583; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
20584; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20585; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
20586; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
20587; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
20588; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20589; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
20590; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20591; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20592; GFX10-WGP-NEXT:    buffer_gl1_inv
20593; GFX10-WGP-NEXT:    buffer_gl0_inv
20594; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
20595; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
20596; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20597; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
20598; GFX10-WGP-NEXT:    s_endpgm
20599;
20600; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20601; GFX10-CU:       ; %bb.0: ; %entry
20602; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
20603; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20604; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
20605; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
20606; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
20607; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20608; GFX10-CU-NEXT:    s_mov_b32 s6, s4
20609; GFX10-CU-NEXT:    s_mov_b32 s7, s5
20610; GFX10-CU-NEXT:    s_mov_b32 s11, s12
20611; GFX10-CU-NEXT:    s_mov_b32 s10, s13
20612; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
20613; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
20614; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20615; GFX10-CU-NEXT:    s_mov_b32 s7, s10
20616; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
20617; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
20618; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20619; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
20620; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
20621; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
20622; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
20623; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
20624; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20625; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
20626; GFX10-CU-NEXT:    buffer_gl1_inv
20627; GFX10-CU-NEXT:    buffer_gl0_inv
20628; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
20629; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
20630; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20631; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
20632; GFX10-CU-NEXT:    s_endpgm
20633;
20634; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20635; SKIP-CACHE-INV:       ; %bb.0: ; %entry
20636; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
20637; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
20638; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
20639; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
20640; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
20641; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20642; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
20643; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
20644; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
20645; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
20646; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
20647; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
20648; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
20649; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
20650; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
20651; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
20652; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20653; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
20654; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
20655; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
20656; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
20657; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20658; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
20659; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
20660; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
20661; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20662; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
20663; SKIP-CACHE-INV-NEXT:    s_endpgm
20664;
20665; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20666; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
20667; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20668; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20669; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20670; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20671; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20672; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20673; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20674; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20675; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20676; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
20677; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20678; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20679; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20680; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
20681; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
20682; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20683; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20684; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20685; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
20686;
20687; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20688; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
20689; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20690; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20691; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20692; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20693; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
20694; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
20695; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20696; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20697; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20698; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
20699; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20700; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20701; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20702; GFX90A-TGSPLIT-NEXT:    buffer_invl2
20703; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
20704; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20705; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
20706; GFX90A-TGSPLIT-NEXT:    s_endpgm
20707;
20708; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20709; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
20710; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20711; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20712; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20713; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20714; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20715; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20716; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20717; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20718; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20719; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
20720; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20721; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
20722; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20723; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
20724; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20725; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20726; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20727; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
20728;
20729; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20730; GFX940-TGSPLIT:       ; %bb.0: ; %entry
20731; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
20732; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
20733; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
20734; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20735; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
20736; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
20737; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20738; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
20739; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20740; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
20741; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20742; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
20743; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
20744; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
20745; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
20746; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
20747; GFX940-TGSPLIT-NEXT:    s_endpgm
20748;
20749; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20750; GFX11-WGP:       ; %bb.0: ; %entry
20751; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20752; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20753; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20754; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20755; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
20756; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
20757; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20758; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
20759; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20760; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20761; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
20762; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
20763; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20764; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
20765; GFX11-WGP-NEXT:    buffer_gl1_inv
20766; GFX11-WGP-NEXT:    buffer_gl0_inv
20767; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
20768; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
20769; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20770; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
20771; GFX11-WGP-NEXT:    s_endpgm
20772;
20773; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20774; GFX11-CU:       ; %bb.0: ; %entry
20775; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20776; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20777; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20778; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20779; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
20780; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
20781; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20782; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
20783; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20784; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20785; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
20786; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
20787; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20788; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
20789; GFX11-CU-NEXT:    buffer_gl1_inv
20790; GFX11-CU-NEXT:    buffer_gl0_inv
20791; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
20792; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
20793; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
20794; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
20795; GFX11-CU-NEXT:    s_endpgm
20796;
20797; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20798; GFX12-WGP:       ; %bb.0: ; %entry
20799; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20800; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
20801; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
20802; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
20803; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
20804; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
20805; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20806; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
20807; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20808; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20809; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
20810; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
20811; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
20812; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
20813; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
20814; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20815; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
20816; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
20817; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
20818; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
20819; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
20820; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
20821; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
20822; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
20823; GFX12-WGP-NEXT:    s_endpgm
20824;
20825; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
20826; GFX12-CU:       ; %bb.0: ; %entry
20827; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
20828; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
20829; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
20830; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
20831; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
20832; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
20833; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20834; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
20835; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20836; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20837; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
20838; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
20839; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
20840; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
20841; GFX12-CU-NEXT:    s_wait_storecnt 0x0
20842; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20843; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
20844; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
20845; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
20846; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
20847; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
20848; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
20849; GFX12-CU-NEXT:    s_wait_dscnt 0x0
20850; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
20851; GFX12-CU-NEXT:    s_endpgm
20852    ptr %out, i32 %in, i32 %old) {
20853entry:
  ; Address of the fifth i32 starting at %out, i.e. a 16-byte displacement;
  ; this is the `offset:16` (or the added constant 16) in the assertions above.
20854  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and, on a match, replace it with %in.
  ; Success ordering is release, failure ordering is acquire.
20855  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire
  ; Take field 0 (the i32) of the { i32, i1 } result; the i1 field is unused.
20856  %val0 = extractvalue { i32, i1 } %val, 0
  ; Plain store of that i32 to the base pointer %out, not to %gep.
20857  store i32 %val0, ptr %out, align 4
20858  ret void
20859}
20860
20861define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
20862; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
20863; GFX7:       ; %bb.0: ; %entry
20864; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
20865; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20866; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
20867; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
20868; GFX7-NEXT:    s_mov_b64 s[12:13], 16
20869; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20870; GFX7-NEXT:    s_mov_b32 s6, s4
20871; GFX7-NEXT:    s_mov_b32 s7, s5
20872; GFX7-NEXT:    s_mov_b32 s11, s12
20873; GFX7-NEXT:    s_mov_b32 s10, s13
20874; GFX7-NEXT:    s_add_u32 s6, s6, s11
20875; GFX7-NEXT:    s_addc_u32 s10, s7, s10
20876; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20877; GFX7-NEXT:    s_mov_b32 s7, s10
20878; GFX7-NEXT:    v_mov_b32_e32 v2, s9
20879; GFX7-NEXT:    v_mov_b32_e32 v0, s8
20880; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20881; GFX7-NEXT:    v_mov_b32_e32 v3, v0
20882; GFX7-NEXT:    v_mov_b32_e32 v0, s6
20883; GFX7-NEXT:    v_mov_b32_e32 v1, s7
20884; GFX7-NEXT:    s_waitcnt vmcnt(0)
20885; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20886; GFX7-NEXT:    s_waitcnt vmcnt(0)
20887; GFX7-NEXT:    buffer_wbinvl1_vol
20888; GFX7-NEXT:    v_mov_b32_e32 v0, s4
20889; GFX7-NEXT:    v_mov_b32_e32 v1, s5
20890; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20891; GFX7-NEXT:    flat_store_dword v[0:1], v2
20892; GFX7-NEXT:    s_endpgm
20893;
20894; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
20895; GFX10-WGP:       ; %bb.0: ; %entry
20896; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
20897; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20898; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
20899; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
20900; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
20901; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20902; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
20903; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
20904; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
20905; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
20906; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
20907; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
20908; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20909; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
20910; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
20911; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
20912; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20913; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
20914; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
20915; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
20916; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20917; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
20918; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20919; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
20920; GFX10-WGP-NEXT:    buffer_gl1_inv
20921; GFX10-WGP-NEXT:    buffer_gl0_inv
20922; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
20923; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
20924; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
20925; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
20926; GFX10-WGP-NEXT:    s_endpgm
20927;
20928; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
20929; GFX10-CU:       ; %bb.0: ; %entry
20930; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
20931; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
20932; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
20933; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
20934; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
20935; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20936; GFX10-CU-NEXT:    s_mov_b32 s6, s4
20937; GFX10-CU-NEXT:    s_mov_b32 s7, s5
20938; GFX10-CU-NEXT:    s_mov_b32 s11, s12
20939; GFX10-CU-NEXT:    s_mov_b32 s10, s13
20940; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
20941; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
20942; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20943; GFX10-CU-NEXT:    s_mov_b32 s7, s10
20944; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
20945; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
20946; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20947; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
20948; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
20949; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
20950; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
20951; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
20952; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20953; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
20954; GFX10-CU-NEXT:    buffer_gl1_inv
20955; GFX10-CU-NEXT:    buffer_gl0_inv
20956; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
20957; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
20958; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
20959; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
20960; GFX10-CU-NEXT:    s_endpgm
20961;
20962; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
20963; SKIP-CACHE-INV:       ; %bb.0: ; %entry
20964; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
20965; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
20966; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
20967; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
20968; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
20969; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20970; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
20971; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
20972; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
20973; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
20974; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
20975; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
20976; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
20977; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
20978; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
20979; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
20980; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20981; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
20982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
20983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
20984; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
20985; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20986; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
20987; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
20988; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
20989; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
20990; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
20991; SKIP-CACHE-INV-NEXT:    s_endpgm
20992;
20993; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
20994; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
20995; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
20996; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
20997; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
20998; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
20999; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21000; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21001; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21002; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21003; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21004; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
21005; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21006; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21007; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21008; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
21009; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
21010; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21011; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21012; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21013; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
21014;
21015; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21016; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
21017; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
21018; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
21019; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
21020; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21021; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21022; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21023; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21024; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21025; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21026; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
21027; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21028; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21029; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21030; GFX90A-TGSPLIT-NEXT:    buffer_invl2
21031; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
21032; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21033; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21034; GFX90A-TGSPLIT-NEXT:    s_endpgm
21035;
21036; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21037; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
21038; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
21039; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
21040; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
21041; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21042; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
21043; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
21044; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21045; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21046; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21047; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
21048; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21049; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
21050; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21051; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
21052; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21053; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21054; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
21055; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
21056;
21057; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21058; GFX940-TGSPLIT:       ; %bb.0: ; %entry
21059; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
21060; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
21061; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
21062; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21063; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
21064; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
21065; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21066; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21067; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21068; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
21069; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21070; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
21071; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21072; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
21073; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21074; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
21075; GFX940-TGSPLIT-NEXT:    s_endpgm
21076;
21077; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21078; GFX11-WGP:       ; %bb.0: ; %entry
21079; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21080; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
21081; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
21082; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21083; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
21084; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
21085; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21086; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
21087; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
21088; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
21089; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
21090; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
21091; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
21092; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
21093; GFX11-WGP-NEXT:    buffer_gl1_inv
21094; GFX11-WGP-NEXT:    buffer_gl0_inv
21095; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
21096; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
21097; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21098; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
21099; GFX11-WGP-NEXT:    s_endpgm
21100;
21101; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21102; GFX11-CU:       ; %bb.0: ; %entry
21103; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21104; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
21105; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
21106; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
21107; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
21108; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
21109; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21110; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
21111; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
21112; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
21113; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
21114; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
21115; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
21116; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
21117; GFX11-CU-NEXT:    buffer_gl1_inv
21118; GFX11-CU-NEXT:    buffer_gl0_inv
21119; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
21120; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
21121; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
21122; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
21123; GFX11-CU-NEXT:    s_endpgm
21124;
21125; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21126; GFX12-WGP:       ; %bb.0: ; %entry
21127; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21128; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
21129; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
21130; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
21131; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
21132; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
21133; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21134; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
21135; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
21136; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
21137; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
21138; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
21139; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
21140; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
21141; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
21142; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21143; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
21144; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
21145; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
21146; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
21147; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
21148; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
21149; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
21150; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
21151; GFX12-WGP-NEXT:    s_endpgm
21152;
21153; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
21154; GFX12-CU:       ; %bb.0: ; %entry
21155; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21156; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
21157; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
21158; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
21159; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
21160; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
21161; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21162; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
21163; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
21164; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
21165; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
21166; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
21167; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
21168; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
21169; GFX12-CU-NEXT:    s_wait_storecnt 0x0
21170; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21171; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
21172; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
21173; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
21174; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
21175; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
21176; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
21177; GFX12-CU-NEXT:    s_wait_dscnt 0x0
21178; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
21179; GFX12-CU-NEXT:    s_endpgm
21180    ptr %out, i32 %in, i32 %old) {
21181entry:
21182  %gep = getelementptr i32, ptr %out, i32 4
21183  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
21184  %val0 = extractvalue { i32, i1 } %val, 0
21185  store i32 %val0, ptr %out, align 4
21186  ret void
21187}
21188
; Test kernel: value-returning compare-exchange through an unqualified `ptr`.
; The IR body (after the CHECK lines) performs a volatile cmpxchg on the i32
; located 4 elements (16 bytes) past %out, using syncscope("one-as") with
; seq_cst success ordering and acquire failure ordering, and then stores the
; value the cmpxchg loaded back to %out. The 16-byte displacement shows up in
; the checks either as `offset:16` on the atomic or as an explicit
; s_add_u32/s_addc_u32 of the constant 16, depending on the check prefix.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
21189define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
21190; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21191; GFX7:       ; %bb.0: ; %entry
21192; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
21193; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21194; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
21195; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
21196; GFX7-NEXT:    s_mov_b64 s[12:13], 16
21197; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21198; GFX7-NEXT:    s_mov_b32 s6, s4
21199; GFX7-NEXT:    s_mov_b32 s7, s5
21200; GFX7-NEXT:    s_mov_b32 s11, s12
21201; GFX7-NEXT:    s_mov_b32 s10, s13
21202; GFX7-NEXT:    s_add_u32 s6, s6, s11
21203; GFX7-NEXT:    s_addc_u32 s10, s7, s10
21204; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21205; GFX7-NEXT:    s_mov_b32 s7, s10
21206; GFX7-NEXT:    v_mov_b32_e32 v2, s9
21207; GFX7-NEXT:    v_mov_b32_e32 v0, s8
21208; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21209; GFX7-NEXT:    v_mov_b32_e32 v3, v0
21210; GFX7-NEXT:    v_mov_b32_e32 v0, s6
21211; GFX7-NEXT:    v_mov_b32_e32 v1, s7
21212; GFX7-NEXT:    s_waitcnt vmcnt(0)
21213; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21214; GFX7-NEXT:    s_waitcnt vmcnt(0)
21215; GFX7-NEXT:    buffer_wbinvl1_vol
21216; GFX7-NEXT:    v_mov_b32_e32 v0, s4
21217; GFX7-NEXT:    v_mov_b32_e32 v1, s5
21218; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21219; GFX7-NEXT:    flat_store_dword v[0:1], v2
21220; GFX7-NEXT:    s_endpgm
21221;
21222; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21223; GFX10-WGP:       ; %bb.0: ; %entry
21224; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
21225; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21226; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
21227; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
21228; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
21229; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21230; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
21231; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
21232; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
21233; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
21234; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
21235; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
21236; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21237; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
21238; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
21239; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
21240; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21241; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
21242; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
21243; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
21244; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
21245; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
21246; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21247; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
21248; GFX10-WGP-NEXT:    buffer_gl1_inv
21249; GFX10-WGP-NEXT:    buffer_gl0_inv
21250; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
21251; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
21252; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21253; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
21254; GFX10-WGP-NEXT:    s_endpgm
21255;
21256; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21257; GFX10-CU:       ; %bb.0: ; %entry
21258; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
21259; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21260; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
21261; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
21262; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
21263; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
21264; GFX10-CU-NEXT:    s_mov_b32 s6, s4
21265; GFX10-CU-NEXT:    s_mov_b32 s7, s5
21266; GFX10-CU-NEXT:    s_mov_b32 s11, s12
21267; GFX10-CU-NEXT:    s_mov_b32 s10, s13
21268; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
21269; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
21270; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21271; GFX10-CU-NEXT:    s_mov_b32 s7, s10
21272; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
21273; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
21274; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21275; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
21276; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
21277; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
21278; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
21279; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
21280; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21281; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
21282; GFX10-CU-NEXT:    buffer_gl1_inv
21283; GFX10-CU-NEXT:    buffer_gl0_inv
21284; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
21285; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
21286; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
21287; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
21288; GFX10-CU-NEXT:    s_endpgm
21289;
21290; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21291; SKIP-CACHE-INV:       ; %bb.0: ; %entry
21292; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
21293; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
21294; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
21295; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
21296; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
21297; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
21298; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
21299; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
21300; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
21301; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
21302; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
21303; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
21304; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
21305; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
21306; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
21307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
21308; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21309; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
21310; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
21311; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
21312; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
21313; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21314; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
21315; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
21316; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
21317; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
21318; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
21319; SKIP-CACHE-INV-NEXT:    s_endpgm
21320;
21321; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21322; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
21323; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
21324; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
21325; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
21326; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21327; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21328; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21329; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21330; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21331; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21332; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
21333; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21334; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21335; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21336; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
21337; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
21338; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21339; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21340; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21341; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
21342;
21343; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21344; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
21345; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
21346; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
21347; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
21348; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21349; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21350; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21351; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21352; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21353; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21354; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
21355; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21356; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21357; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21358; GFX90A-TGSPLIT-NEXT:    buffer_invl2
21359; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
21360; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21361; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21362; GFX90A-TGSPLIT-NEXT:    s_endpgm
21363;
21364; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21365; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
21366; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
21367; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
21368; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
21369; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21370; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
21371; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
21372; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21373; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21374; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21375; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
21376; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21377; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
21378; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21379; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
21380; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21381; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21382; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
21383; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
21384;
21385; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21386; GFX940-TGSPLIT:       ; %bb.0: ; %entry
21387; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
21388; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
21389; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
21390; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21391; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
21392; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
21393; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21394; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21395; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21396; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
21397; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21398; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
21399; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21400; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
21401; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21402; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
21403; GFX940-TGSPLIT-NEXT:    s_endpgm
21404;
21405; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21406; GFX11-WGP:       ; %bb.0: ; %entry
21407; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21408; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
21409; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
21410; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21411; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
21412; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
21413; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21414; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
21415; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
21416; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
21417; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
21418; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
21419; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
21420; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
21421; GFX11-WGP-NEXT:    buffer_gl1_inv
21422; GFX11-WGP-NEXT:    buffer_gl0_inv
21423; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
21424; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
21425; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21426; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
21427; GFX11-WGP-NEXT:    s_endpgm
21428;
21429; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21430; GFX11-CU:       ; %bb.0: ; %entry
21431; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21432; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
21433; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
21434; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
21435; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
21436; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
21437; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21438; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
21439; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
21440; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
21441; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
21442; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
21443; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
21444; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
21445; GFX11-CU-NEXT:    buffer_gl1_inv
21446; GFX11-CU-NEXT:    buffer_gl0_inv
21447; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
21448; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
21449; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
21450; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
21451; GFX11-CU-NEXT:    s_endpgm
21452;
21453; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21454; GFX12-WGP:       ; %bb.0: ; %entry
21455; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21456; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
21457; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
21458; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
21459; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
21460; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
21461; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21462; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
21463; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
21464; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
21465; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
21466; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
21467; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
21468; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
21469; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
21470; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21471; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
21472; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
21473; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
21474; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
21475; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
21476; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
21477; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
21478; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
21479; GFX12-WGP-NEXT:    s_endpgm
21480;
21481; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
21482; GFX12-CU:       ; %bb.0: ; %entry
21483; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21484; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
21485; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
21486; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
21487; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
21488; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
21489; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21490; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
21491; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
21492; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
21493; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
21494; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
21495; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
21496; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
21497; GFX12-CU-NEXT:    s_wait_storecnt 0x0
21498; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21499; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
21500; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
21501; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
21502; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
21503; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
21504; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
21505; GFX12-CU-NEXT:    s_wait_dscnt 0x0
21506; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
21507; GFX12-CU-NEXT:    s_endpgm
21508    ptr %out, i32 %in, i32 %old) {
21509entry:
  ; Address of the i32 four elements past %out (a 16-byte displacement).
21510  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and, on a match, write %in. Success ordering is
  ; seq_cst, failure ordering is acquire, sync scope is "one-as".
21511  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
  ; Element 0 of the { i32, i1 } result is the value that was loaded.
21512  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value to %out so the returned result stays live.
21513  store i32 %val0, ptr %out, align 4
21514  ret void
21515}
21516
21517define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
21518; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21519; GFX7:       ; %bb.0: ; %entry
21520; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
21521; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21522; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
21523; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
21524; GFX7-NEXT:    s_mov_b64 s[12:13], 16
21525; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21526; GFX7-NEXT:    s_mov_b32 s6, s4
21527; GFX7-NEXT:    s_mov_b32 s7, s5
21528; GFX7-NEXT:    s_mov_b32 s11, s12
21529; GFX7-NEXT:    s_mov_b32 s10, s13
21530; GFX7-NEXT:    s_add_u32 s6, s6, s11
21531; GFX7-NEXT:    s_addc_u32 s10, s7, s10
21532; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21533; GFX7-NEXT:    s_mov_b32 s7, s10
21534; GFX7-NEXT:    v_mov_b32_e32 v2, s9
21535; GFX7-NEXT:    v_mov_b32_e32 v0, s8
21536; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21537; GFX7-NEXT:    v_mov_b32_e32 v3, v0
21538; GFX7-NEXT:    v_mov_b32_e32 v0, s6
21539; GFX7-NEXT:    v_mov_b32_e32 v1, s7
21540; GFX7-NEXT:    s_waitcnt vmcnt(0)
21541; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21542; GFX7-NEXT:    s_waitcnt vmcnt(0)
21543; GFX7-NEXT:    buffer_wbinvl1_vol
21544; GFX7-NEXT:    v_mov_b32_e32 v0, s4
21545; GFX7-NEXT:    v_mov_b32_e32 v1, s5
21546; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21547; GFX7-NEXT:    flat_store_dword v[0:1], v2
21548; GFX7-NEXT:    s_endpgm
21549;
21550; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21551; GFX10-WGP:       ; %bb.0: ; %entry
21552; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
21553; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21554; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
21555; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
21556; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
21557; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21558; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
21559; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
21560; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
21561; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
21562; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
21563; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
21564; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21565; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
21566; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
21567; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
21568; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21569; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
21570; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
21571; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
21572; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
21573; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
21574; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21575; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
21576; GFX10-WGP-NEXT:    buffer_gl1_inv
21577; GFX10-WGP-NEXT:    buffer_gl0_inv
21578; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
21579; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
21580; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21581; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
21582; GFX10-WGP-NEXT:    s_endpgm
21583;
21584; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21585; GFX10-CU:       ; %bb.0: ; %entry
21586; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
21587; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21588; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
21589; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
21590; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
21591; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
21592; GFX10-CU-NEXT:    s_mov_b32 s6, s4
21593; GFX10-CU-NEXT:    s_mov_b32 s7, s5
21594; GFX10-CU-NEXT:    s_mov_b32 s11, s12
21595; GFX10-CU-NEXT:    s_mov_b32 s10, s13
21596; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
21597; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
21598; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21599; GFX10-CU-NEXT:    s_mov_b32 s7, s10
21600; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
21601; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
21602; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21603; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
21604; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
21605; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
21606; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
21607; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
21608; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21609; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
21610; GFX10-CU-NEXT:    buffer_gl1_inv
21611; GFX10-CU-NEXT:    buffer_gl0_inv
21612; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
21613; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
21614; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
21615; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
21616; GFX10-CU-NEXT:    s_endpgm
21617;
21618; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21619; SKIP-CACHE-INV:       ; %bb.0: ; %entry
21620; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
21621; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
21622; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
21623; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
21624; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
21625; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
21626; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
21627; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
21628; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
21629; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
21630; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
21631; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
21632; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
21633; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
21634; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
21635; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
21636; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
21638; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
21639; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
21640; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
21641; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21642; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
21643; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
21644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
21645; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
21646; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
21647; SKIP-CACHE-INV-NEXT:    s_endpgm
21648;
21649; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21650; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
21651; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
21652; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
21653; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
21654; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21655; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21656; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21657; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21658; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21659; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21660; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
21661; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21662; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21663; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21664; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
21665; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
21666; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21667; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21668; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21669; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
21670;
21671; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21672; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
21673; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
21674; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
21675; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
21676; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21677; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21678; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21679; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21680; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21681; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21682; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
21683; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21684; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21685; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21686; GFX90A-TGSPLIT-NEXT:    buffer_invl2
21687; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
21688; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21689; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21690; GFX90A-TGSPLIT-NEXT:    s_endpgm
21691;
21692; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21693; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
21694; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
21695; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
21696; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
21697; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21698; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
21699; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
21700; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21701; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21702; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21703; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
21704; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21705; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
21706; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21707; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
21708; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21709; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21710; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
21711; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
21712;
21713; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21714; GFX940-TGSPLIT:       ; %bb.0: ; %entry
21715; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
21716; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
21717; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
21718; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21719; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
21720; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
21721; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21722; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21723; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21724; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
21725; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21726; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
21727; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21728; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
21729; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
21730; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
21731; GFX940-TGSPLIT-NEXT:    s_endpgm
21732;
21733; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21734; GFX11-WGP:       ; %bb.0: ; %entry
21735; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21736; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
21737; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
21738; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21739; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
21740; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
21741; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21742; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
21743; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
21744; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
21745; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
21746; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
21747; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
21748; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
21749; GFX11-WGP-NEXT:    buffer_gl1_inv
21750; GFX11-WGP-NEXT:    buffer_gl0_inv
21751; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
21752; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
21753; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21754; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
21755; GFX11-WGP-NEXT:    s_endpgm
21756;
21757; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21758; GFX11-CU:       ; %bb.0: ; %entry
21759; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21760; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
21761; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
21762; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
21763; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
21764; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
21765; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21766; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
21767; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
21768; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
21769; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
21770; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
21771; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
21772; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
21773; GFX11-CU-NEXT:    buffer_gl1_inv
21774; GFX11-CU-NEXT:    buffer_gl0_inv
21775; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
21776; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
21777; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
21778; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
21779; GFX11-CU-NEXT:    s_endpgm
21780;
21781; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21782; GFX12-WGP:       ; %bb.0: ; %entry
21783; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21784; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
21785; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
21786; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
21787; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
21788; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
21789; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21790; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
21791; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
21792; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
21793; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
21794; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
21795; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
21796; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
21797; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
21798; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21799; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
21800; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
21801; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
21802; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
21803; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
21804; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
21805; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
21806; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
21807; GFX12-WGP-NEXT:    s_endpgm
21808;
21809; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
21810; GFX12-CU:       ; %bb.0: ; %entry
21811; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
21812; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
21813; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
21814; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
21815; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
21816; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
21817; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21818; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
21819; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
21820; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
21821; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
21822; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
21823; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
21824; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
21825; GFX12-CU-NEXT:    s_wait_storecnt 0x0
21826; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21827; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
21828; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
21829; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
21830; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
21831; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
21832; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
21833; GFX12-CU-NEXT:    s_wait_dscnt 0x0
21834; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
21835; GFX12-CU-NEXT:    s_endpgm
21836    ptr %out, i32 %in, i32 %old) {
21837entry:
21838  %gep = getelementptr i32, ptr %out, i32 4
21839  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
21840  %val0 = extractvalue { i32, i1 } %val, 0
21841  store i32 %val0, ptr %out, align 4
21842  ret void
21843}
21844
; Kernel under test: a volatile 32-bit cmpxchg at syncscope("one-as") with
; success ordering `acquire` and failure ordering `seq_cst`, issued through a
; plain `ptr` located four i32 elements past %out. The value produced by the
; cmpxchg is stored back to %out, i.e. the result of the atomic is consumed
; (the "_ret_" part of the kernel name).
;
; The per-target assertion lines that sit between the `define` line and the
; parameter list are autogenerated (see the NOTE on the first line of this
; file). Regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
21845define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
21846; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
21847; GFX7:       ; %bb.0: ; %entry
21848; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
21849; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21850; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
21851; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
21852; GFX7-NEXT:    s_mov_b64 s[12:13], 16
21853; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21854; GFX7-NEXT:    s_mov_b32 s6, s4
21855; GFX7-NEXT:    s_mov_b32 s7, s5
21856; GFX7-NEXT:    s_mov_b32 s11, s12
21857; GFX7-NEXT:    s_mov_b32 s10, s13
21858; GFX7-NEXT:    s_add_u32 s6, s6, s11
21859; GFX7-NEXT:    s_addc_u32 s10, s7, s10
21860; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21861; GFX7-NEXT:    s_mov_b32 s7, s10
21862; GFX7-NEXT:    v_mov_b32_e32 v2, s9
21863; GFX7-NEXT:    v_mov_b32_e32 v0, s8
21864; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21865; GFX7-NEXT:    v_mov_b32_e32 v3, v0
21866; GFX7-NEXT:    v_mov_b32_e32 v0, s6
21867; GFX7-NEXT:    v_mov_b32_e32 v1, s7
21868; GFX7-NEXT:    s_waitcnt vmcnt(0)
21869; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21870; GFX7-NEXT:    s_waitcnt vmcnt(0)
21871; GFX7-NEXT:    buffer_wbinvl1_vol
21872; GFX7-NEXT:    v_mov_b32_e32 v0, s4
21873; GFX7-NEXT:    v_mov_b32_e32 v1, s5
21874; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21875; GFX7-NEXT:    flat_store_dword v[0:1], v2
21876; GFX7-NEXT:    s_endpgm
21877;
21878; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
21879; GFX10-WGP:       ; %bb.0: ; %entry
21880; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
21881; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21882; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
21883; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
21884; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
21885; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21886; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
21887; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
21888; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
21889; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
21890; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
21891; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
21892; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21893; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
21894; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
21895; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
21896; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21897; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
21898; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
21899; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
21900; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
21901; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
21902; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21903; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
21904; GFX10-WGP-NEXT:    buffer_gl1_inv
21905; GFX10-WGP-NEXT:    buffer_gl0_inv
21906; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
21907; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
21908; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
21909; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
21910; GFX10-WGP-NEXT:    s_endpgm
21911;
21912; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
21913; GFX10-CU:       ; %bb.0: ; %entry
21914; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
21915; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
21916; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
21917; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
21918; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
21919; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
21920; GFX10-CU-NEXT:    s_mov_b32 s6, s4
21921; GFX10-CU-NEXT:    s_mov_b32 s7, s5
21922; GFX10-CU-NEXT:    s_mov_b32 s11, s12
21923; GFX10-CU-NEXT:    s_mov_b32 s10, s13
21924; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
21925; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
21926; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
21927; GFX10-CU-NEXT:    s_mov_b32 s7, s10
21928; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
21929; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
21930; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21931; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
21932; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
21933; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
21934; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
21935; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
21936; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21937; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
21938; GFX10-CU-NEXT:    buffer_gl1_inv
21939; GFX10-CU-NEXT:    buffer_gl0_inv
21940; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
21941; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
21942; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
21943; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
21944; GFX10-CU-NEXT:    s_endpgm
21945;
21946; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
21947; SKIP-CACHE-INV:       ; %bb.0: ; %entry
21948; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
21949; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
21950; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
21951; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
21952; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
21953; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
21954; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
21955; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
21956; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
21957; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
21958; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
21959; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
21960; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
21961; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
21962; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
21963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
21964; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
21966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
21967; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
21968; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
21969; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
21970; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
21971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
21972; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
21973; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
21974; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
21975; SKIP-CACHE-INV-NEXT:    s_endpgm
21976;
21977; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
21978; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
21979; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
21980; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
21981; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
21982; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21983; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
21984; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
21985; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
21986; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
21987; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21988; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
21989; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21990; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
21991; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
21992; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
21993; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
21994; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
21995; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
21996; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
21997; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
21998;
21999; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22000; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
22001; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22002; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22003; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22004; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22005; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22006; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22007; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22008; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22009; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22010; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
22011; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22012; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22013; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22014; GFX90A-TGSPLIT-NEXT:    buffer_invl2
22015; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
22016; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22017; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22018; GFX90A-TGSPLIT-NEXT:    s_endpgm
22019;
22020; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22021; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
22022; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22023; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
22024; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
22025; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22026; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
22027; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
22028; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22029; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22030; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22031; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
22032; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22033; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
22034; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22035; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
22036; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22037; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22038; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
22039; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
22040;
22041; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22042; GFX940-TGSPLIT:       ; %bb.0: ; %entry
22043; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22044; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
22045; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
22046; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22047; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
22048; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
22049; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22050; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22051; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22052; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
22053; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22054; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
22055; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22056; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
22057; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22058; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
22059; GFX940-TGSPLIT-NEXT:    s_endpgm
22060;
22061; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22062; GFX11-WGP:       ; %bb.0: ; %entry
22063; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22064; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
22065; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
22066; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22067; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
22068; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
22069; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22070; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
22071; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
22072; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
22073; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
22074; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
22075; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
22076; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
22077; GFX11-WGP-NEXT:    buffer_gl1_inv
22078; GFX11-WGP-NEXT:    buffer_gl0_inv
22079; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
22080; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
22081; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22082; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
22083; GFX11-WGP-NEXT:    s_endpgm
22084;
22085; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22086; GFX11-CU:       ; %bb.0: ; %entry
22087; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22088; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
22089; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
22090; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
22091; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
22092; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
22093; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22094; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
22095; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
22096; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
22097; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
22098; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
22099; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
22100; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
22101; GFX11-CU-NEXT:    buffer_gl1_inv
22102; GFX11-CU-NEXT:    buffer_gl0_inv
22103; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
22104; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
22105; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
22106; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
22107; GFX11-CU-NEXT:    s_endpgm
22108;
22109; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22110; GFX12-WGP:       ; %bb.0: ; %entry
22111; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22112; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
22113; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
22114; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
22115; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
22116; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
22117; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22118; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
22119; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
22120; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
22121; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
22122; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
22123; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
22124; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
22125; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
22126; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
22127; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
22128; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
22129; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
22130; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
22131; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
22132; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
22133; GFX12-WGP-NEXT:    s_endpgm
22134;
22135; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
22136; GFX12-CU:       ; %bb.0: ; %entry
22137; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22138; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
22139; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
22140; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
22141; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
22142; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
22143; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22144; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
22145; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
22146; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
22147; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
22148; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
22149; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
22150; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
22151; GFX12-CU-NEXT:    s_wait_storecnt 0x0
22152; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
22153; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
22154; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
22155; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
22156; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
22157; GFX12-CU-NEXT:    s_wait_dscnt 0x0
22158; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
22159; GFX12-CU-NEXT:    s_endpgm
22160    ptr %out, i32 %in, i32 %old) {
22161entry:
  ; Element index 4 of i32 is a 16-byte displacement from %out; the assertions
  ; above show it either as an explicit 64-bit scalar add of 16 or folded into
  ; the atomic as `offset:16`, depending on the target.
22162  %gep = getelementptr i32, ptr %out, i32 4
  ; Operands are (pointer, expected, replacement): %old is the compare value
  ; and %in is the value written on a match. The two orderings are success
  ; (`acquire`) followed by failure (`seq_cst`).
22163  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory; field 1
  ; (the success flag) is not used by this kernel.
22164  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value to %out keeps the atomic's result live; the
  ; assertions show the cmpswap writing v2, which is then stored.
22165  store i32 %val0, ptr %out, align 4
22166  ret void
22167}
22168
22169define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
22170; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22171; GFX7:       ; %bb.0: ; %entry
22172; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
22173; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22174; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
22175; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
22176; GFX7-NEXT:    s_mov_b64 s[12:13], 16
22177; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22178; GFX7-NEXT:    s_mov_b32 s6, s4
22179; GFX7-NEXT:    s_mov_b32 s7, s5
22180; GFX7-NEXT:    s_mov_b32 s11, s12
22181; GFX7-NEXT:    s_mov_b32 s10, s13
22182; GFX7-NEXT:    s_add_u32 s6, s6, s11
22183; GFX7-NEXT:    s_addc_u32 s10, s7, s10
22184; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22185; GFX7-NEXT:    s_mov_b32 s7, s10
22186; GFX7-NEXT:    v_mov_b32_e32 v2, s9
22187; GFX7-NEXT:    v_mov_b32_e32 v0, s8
22188; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22189; GFX7-NEXT:    v_mov_b32_e32 v3, v0
22190; GFX7-NEXT:    v_mov_b32_e32 v0, s6
22191; GFX7-NEXT:    v_mov_b32_e32 v1, s7
22192; GFX7-NEXT:    s_waitcnt vmcnt(0)
22193; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22194; GFX7-NEXT:    s_waitcnt vmcnt(0)
22195; GFX7-NEXT:    buffer_wbinvl1_vol
22196; GFX7-NEXT:    v_mov_b32_e32 v0, s4
22197; GFX7-NEXT:    v_mov_b32_e32 v1, s5
22198; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22199; GFX7-NEXT:    flat_store_dword v[0:1], v2
22200; GFX7-NEXT:    s_endpgm
22201;
22202; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22203; GFX10-WGP:       ; %bb.0: ; %entry
22204; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
22205; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22206; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
22207; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
22208; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
22209; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22210; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
22211; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
22212; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
22213; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
22214; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
22215; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
22216; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22217; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
22218; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
22219; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
22220; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22221; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
22222; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
22223; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
22224; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
22225; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
22226; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22227; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
22228; GFX10-WGP-NEXT:    buffer_gl1_inv
22229; GFX10-WGP-NEXT:    buffer_gl0_inv
22230; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
22231; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
22232; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22233; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
22234; GFX10-WGP-NEXT:    s_endpgm
22235;
22236; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22237; GFX10-CU:       ; %bb.0: ; %entry
22238; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
22239; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22240; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
22241; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
22242; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
22243; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
22244; GFX10-CU-NEXT:    s_mov_b32 s6, s4
22245; GFX10-CU-NEXT:    s_mov_b32 s7, s5
22246; GFX10-CU-NEXT:    s_mov_b32 s11, s12
22247; GFX10-CU-NEXT:    s_mov_b32 s10, s13
22248; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
22249; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
22250; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22251; GFX10-CU-NEXT:    s_mov_b32 s7, s10
22252; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
22253; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
22254; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22255; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
22256; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
22257; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
22258; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
22259; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
22260; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22261; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
22262; GFX10-CU-NEXT:    buffer_gl1_inv
22263; GFX10-CU-NEXT:    buffer_gl0_inv
22264; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
22265; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
22266; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
22267; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
22268; GFX10-CU-NEXT:    s_endpgm
22269;
22270; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22271; SKIP-CACHE-INV:       ; %bb.0: ; %entry
22272; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
22273; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
22274; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
22275; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
22276; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
22277; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
22278; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
22279; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
22280; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
22281; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
22282; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
22283; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
22284; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
22285; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
22286; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
22287; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
22288; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22289; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
22290; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
22291; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
22292; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
22293; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22294; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
22295; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
22296; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
22297; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
22298; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
22299; SKIP-CACHE-INV-NEXT:    s_endpgm
22300;
22301; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22302; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
22303; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22304; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22305; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22306; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22307; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22308; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22309; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22310; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22311; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22312; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
22313; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22314; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22315; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22316; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
22317; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
22318; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22319; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22320; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22321; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
22322;
22323; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22324; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
22325; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22326; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22327; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22328; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22329; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22330; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22331; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22332; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22333; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22334; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
22335; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22336; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22337; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22338; GFX90A-TGSPLIT-NEXT:    buffer_invl2
22339; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
22340; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22341; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22342; GFX90A-TGSPLIT-NEXT:    s_endpgm
22343;
22344; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22345; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
22346; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22347; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
22348; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
22349; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22350; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
22351; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
22352; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22353; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22354; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22355; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
22356; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22357; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
22358; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22359; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
22360; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22361; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22362; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
22363; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
22364;
22365; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22366; GFX940-TGSPLIT:       ; %bb.0: ; %entry
22367; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22368; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
22369; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
22370; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22371; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
22372; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
22373; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22374; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22375; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22376; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
22377; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22378; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
22379; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22380; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
22381; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22382; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
22383; GFX940-TGSPLIT-NEXT:    s_endpgm
22384;
22385; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22386; GFX11-WGP:       ; %bb.0: ; %entry
22387; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22388; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
22389; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
22390; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22391; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
22392; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
22393; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22394; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
22395; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
22396; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
22397; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
22398; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
22399; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
22400; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
22401; GFX11-WGP-NEXT:    buffer_gl1_inv
22402; GFX11-WGP-NEXT:    buffer_gl0_inv
22403; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
22404; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
22405; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22406; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
22407; GFX11-WGP-NEXT:    s_endpgm
22408;
22409; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22410; GFX11-CU:       ; %bb.0: ; %entry
22411; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22412; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
22413; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
22414; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
22415; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
22416; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
22417; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22418; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
22419; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
22420; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
22421; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
22422; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
22423; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
22424; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
22425; GFX11-CU-NEXT:    buffer_gl1_inv
22426; GFX11-CU-NEXT:    buffer_gl0_inv
22427; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
22428; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
22429; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
22430; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
22431; GFX11-CU-NEXT:    s_endpgm
22432;
22433; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22434; GFX12-WGP:       ; %bb.0: ; %entry
22435; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22436; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
22437; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
22438; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
22439; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
22440; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
22441; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22442; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
22443; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
22444; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
22445; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
22446; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
22447; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
22448; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
22449; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
22450; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
22451; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
22452; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
22453; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
22454; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
22455; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
22456; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
22457; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
22458; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
22459; GFX12-WGP-NEXT:    s_endpgm
22460;
22461; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
22462; GFX12-CU:       ; %bb.0: ; %entry
22463; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22464; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
22465; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
22466; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
22467; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
22468; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
22469; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22470; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
22471; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
22472; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
22473; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
22474; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
22475; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
22476; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
22477; GFX12-CU-NEXT:    s_wait_storecnt 0x0
22478; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
22479; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
22480; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
22481; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
22482; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
22483; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
22484; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
22485; GFX12-CU-NEXT:    s_wait_dscnt 0x0
22486; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
22487; GFX12-CU-NEXT:    s_endpgm
22488    ptr %out, i32 %in, i32 %old) {
22489entry:
22490  %gep = getelementptr i32, ptr %out, i32 4
22491  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
22492  %val0 = extractvalue { i32, i1 } %val, 0
22493  store i32 %val0, ptr %out, align 4
22494  ret void
22495}
22496
; Returning cmpxchg through a generic (flat) pointer at syncscope("one-as"),
; with acq_rel success ordering and seq_cst failure ordering.
; The kernel compare-and-swaps the i32 located four elements (16 bytes) past
; %out, writing %in when the current value equals %old, and then stores the
; value the cmpxchg read back to %out so the atomic's result is actually used.
; The per-target check lines below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
22497define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
22498; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22499; GFX7:       ; %bb.0: ; %entry
22500; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
22501; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22502; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
22503; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
22504; GFX7-NEXT:    s_mov_b64 s[12:13], 16
22505; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22506; GFX7-NEXT:    s_mov_b32 s6, s4
22507; GFX7-NEXT:    s_mov_b32 s7, s5
22508; GFX7-NEXT:    s_mov_b32 s11, s12
22509; GFX7-NEXT:    s_mov_b32 s10, s13
22510; GFX7-NEXT:    s_add_u32 s6, s6, s11
22511; GFX7-NEXT:    s_addc_u32 s10, s7, s10
22512; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22513; GFX7-NEXT:    s_mov_b32 s7, s10
22514; GFX7-NEXT:    v_mov_b32_e32 v2, s9
22515; GFX7-NEXT:    v_mov_b32_e32 v0, s8
22516; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22517; GFX7-NEXT:    v_mov_b32_e32 v3, v0
22518; GFX7-NEXT:    v_mov_b32_e32 v0, s6
22519; GFX7-NEXT:    v_mov_b32_e32 v1, s7
22520; GFX7-NEXT:    s_waitcnt vmcnt(0)
22521; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22522; GFX7-NEXT:    s_waitcnt vmcnt(0)
22523; GFX7-NEXT:    buffer_wbinvl1_vol
22524; GFX7-NEXT:    v_mov_b32_e32 v0, s4
22525; GFX7-NEXT:    v_mov_b32_e32 v1, s5
22526; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22527; GFX7-NEXT:    flat_store_dword v[0:1], v2
22528; GFX7-NEXT:    s_endpgm
22529;
22530; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22531; GFX10-WGP:       ; %bb.0: ; %entry
22532; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
22533; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22534; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
22535; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
22536; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
22537; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22538; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
22539; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
22540; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
22541; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
22542; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
22543; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
22544; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22545; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
22546; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
22547; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
22548; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22549; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
22550; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
22551; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
22552; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
22553; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
22554; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22555; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
22556; GFX10-WGP-NEXT:    buffer_gl1_inv
22557; GFX10-WGP-NEXT:    buffer_gl0_inv
22558; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
22559; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
22560; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22561; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
22562; GFX10-WGP-NEXT:    s_endpgm
22563;
22564; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22565; GFX10-CU:       ; %bb.0: ; %entry
22566; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
22567; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22568; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
22569; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
22570; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
22571; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
22572; GFX10-CU-NEXT:    s_mov_b32 s6, s4
22573; GFX10-CU-NEXT:    s_mov_b32 s7, s5
22574; GFX10-CU-NEXT:    s_mov_b32 s11, s12
22575; GFX10-CU-NEXT:    s_mov_b32 s10, s13
22576; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
22577; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
22578; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22579; GFX10-CU-NEXT:    s_mov_b32 s7, s10
22580; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
22581; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
22582; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22583; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
22584; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
22585; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
22586; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
22587; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
22588; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22589; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
22590; GFX10-CU-NEXT:    buffer_gl1_inv
22591; GFX10-CU-NEXT:    buffer_gl0_inv
22592; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
22593; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
22594; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
22595; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
22596; GFX10-CU-NEXT:    s_endpgm
22597;
22598; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22599; SKIP-CACHE-INV:       ; %bb.0: ; %entry
22600; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
22601; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
22602; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
22603; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
22604; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
22605; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
22606; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
22607; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
22608; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
22609; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
22610; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
22611; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
22612; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
22613; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
22614; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
22615; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
22616; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
22618; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
22619; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
22620; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
22621; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22622; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
22623; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
22624; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
22625; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
22626; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
22627; SKIP-CACHE-INV-NEXT:    s_endpgm
22628;
22629; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22630; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
22631; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22632; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22633; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22634; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22635; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22636; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22637; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22638; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22639; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22640; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
22641; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22642; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22643; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22644; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
22645; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
22646; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22647; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22648; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22649; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
22650;
22651; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22652; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
22653; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22654; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22655; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22656; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22657; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22658; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22659; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22660; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22661; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22662; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
22663; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22664; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22665; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22666; GFX90A-TGSPLIT-NEXT:    buffer_invl2
22667; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
22668; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22669; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22670; GFX90A-TGSPLIT-NEXT:    s_endpgm
22671;
22672; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22673; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
22674; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22675; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
22676; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
22677; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22678; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
22679; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
22680; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22681; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22682; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22683; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
22684; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22685; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
22686; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22687; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
22688; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22689; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22690; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
22691; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
22692;
22693; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22694; GFX940-TGSPLIT:       ; %bb.0: ; %entry
22695; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
22696; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
22697; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
22698; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22699; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
22700; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
22701; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22702; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22703; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22704; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
22705; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22706; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
22707; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22708; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
22709; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
22710; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
22711; GFX940-TGSPLIT-NEXT:    s_endpgm
22712;
22713; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22714; GFX11-WGP:       ; %bb.0: ; %entry
22715; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22716; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
22717; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
22718; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22719; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
22720; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
22721; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22722; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
22723; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
22724; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
22725; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
22726; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
22727; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
22728; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
22729; GFX11-WGP-NEXT:    buffer_gl1_inv
22730; GFX11-WGP-NEXT:    buffer_gl0_inv
22731; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
22732; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
22733; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22734; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
22735; GFX11-WGP-NEXT:    s_endpgm
22736;
22737; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22738; GFX11-CU:       ; %bb.0: ; %entry
22739; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22740; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
22741; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
22742; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
22743; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
22744; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
22745; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22746; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
22747; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
22748; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
22749; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
22750; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
22751; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
22752; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
22753; GFX11-CU-NEXT:    buffer_gl1_inv
22754; GFX11-CU-NEXT:    buffer_gl0_inv
22755; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
22756; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
22757; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
22758; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
22759; GFX11-CU-NEXT:    s_endpgm
22760;
22761; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22762; GFX12-WGP:       ; %bb.0: ; %entry
22763; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22764; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
22765; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
22766; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
22767; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
22768; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
22769; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22770; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
22771; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
22772; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
22773; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
22774; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
22775; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
22776; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
22777; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
22778; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
22779; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
22780; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
22781; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
22782; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
22783; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
22784; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
22785; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
22786; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
22787; GFX12-WGP-NEXT:    s_endpgm
22788;
22789; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
22790; GFX12-CU:       ; %bb.0: ; %entry
22791; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
22792; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
22793; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
22794; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
22795; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
22796; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
22797; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22798; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
22799; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
22800; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
22801; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
22802; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
22803; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
22804; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
22805; GFX12-CU-NEXT:    s_wait_storecnt 0x0
22806; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
22807; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
22808; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
22809; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
22810; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
22811; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
22812; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
22813; GFX12-CU-NEXT:    s_wait_dscnt 0x0
22814; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
22815; GFX12-CU-NEXT:    s_endpgm
22816    ptr %out, i32 %in, i32 %old) {
22817entry:
  ; Target the i32 at index 4 from %out; this is the 16-byte displacement that
  ; shows up in the generated code as offset:16 or as an explicit 64-bit add.
22818  %gep = getelementptr i32, ptr %out, i32 4
22819  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
  ; Field 0 of the cmpxchg result is the value read from memory; field 1 (the
  ; success flag) is not used by this test.
22820  %val0 = extractvalue { i32, i1 } %val, 0
22821  store i32 %val0, ptr %out, align 4
22822  ret void
22823}
22824
22825define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
22826; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
22827; GFX7:       ; %bb.0: ; %entry
22828; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
22829; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22830; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
22831; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
22832; GFX7-NEXT:    s_mov_b64 s[12:13], 16
22833; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22834; GFX7-NEXT:    s_mov_b32 s6, s4
22835; GFX7-NEXT:    s_mov_b32 s7, s5
22836; GFX7-NEXT:    s_mov_b32 s11, s12
22837; GFX7-NEXT:    s_mov_b32 s10, s13
22838; GFX7-NEXT:    s_add_u32 s6, s6, s11
22839; GFX7-NEXT:    s_addc_u32 s10, s7, s10
22840; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22841; GFX7-NEXT:    s_mov_b32 s7, s10
22842; GFX7-NEXT:    v_mov_b32_e32 v2, s9
22843; GFX7-NEXT:    v_mov_b32_e32 v0, s8
22844; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22845; GFX7-NEXT:    v_mov_b32_e32 v3, v0
22846; GFX7-NEXT:    v_mov_b32_e32 v0, s6
22847; GFX7-NEXT:    v_mov_b32_e32 v1, s7
22848; GFX7-NEXT:    s_waitcnt vmcnt(0)
22849; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22850; GFX7-NEXT:    s_waitcnt vmcnt(0)
22851; GFX7-NEXT:    buffer_wbinvl1_vol
22852; GFX7-NEXT:    v_mov_b32_e32 v0, s4
22853; GFX7-NEXT:    v_mov_b32_e32 v1, s5
22854; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22855; GFX7-NEXT:    flat_store_dword v[0:1], v2
22856; GFX7-NEXT:    s_endpgm
22857;
22858; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
22859; GFX10-WGP:       ; %bb.0: ; %entry
22860; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
22861; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22862; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
22863; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
22864; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
22865; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22866; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
22867; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
22868; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
22869; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
22870; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
22871; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
22872; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22873; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
22874; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
22875; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
22876; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22877; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
22878; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
22879; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
22880; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
22881; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
22882; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22883; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
22884; GFX10-WGP-NEXT:    buffer_gl1_inv
22885; GFX10-WGP-NEXT:    buffer_gl0_inv
22886; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
22887; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
22888; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
22889; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
22890; GFX10-WGP-NEXT:    s_endpgm
22891;
22892; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
22893; GFX10-CU:       ; %bb.0: ; %entry
22894; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
22895; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
22896; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
22897; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
22898; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
22899; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
22900; GFX10-CU-NEXT:    s_mov_b32 s6, s4
22901; GFX10-CU-NEXT:    s_mov_b32 s7, s5
22902; GFX10-CU-NEXT:    s_mov_b32 s11, s12
22903; GFX10-CU-NEXT:    s_mov_b32 s10, s13
22904; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
22905; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
22906; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
22907; GFX10-CU-NEXT:    s_mov_b32 s7, s10
22908; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
22909; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
22910; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22911; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
22912; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
22913; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
22914; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
22915; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
22916; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22917; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
22918; GFX10-CU-NEXT:    buffer_gl1_inv
22919; GFX10-CU-NEXT:    buffer_gl0_inv
22920; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
22921; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
22922; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
22923; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
22924; GFX10-CU-NEXT:    s_endpgm
22925;
22926; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
22927; SKIP-CACHE-INV:       ; %bb.0: ; %entry
22928; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
22929; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
22930; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
22931; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
22932; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
22933; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
22934; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
22935; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
22936; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
22937; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
22938; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
22939; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
22940; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
22941; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
22942; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
22943; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
22944; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22945; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
22946; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
22947; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
22948; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
22949; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
22950; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
22951; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
22952; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
22953; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
22954; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
22955; SKIP-CACHE-INV-NEXT:    s_endpgm
22956;
22957; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
22958; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
22959; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22960; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22961; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22962; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22963; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22964; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22965; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22966; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22967; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22968; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
22969; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22970; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22971; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22972; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
22973; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
22974; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22975; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22976; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22977; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
22978;
22979; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
22980; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
22981; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
22982; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
22983; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
22984; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
22985; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
22986; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
22987; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
22988; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
22989; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22990; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
22991; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22992; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
22993; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
22994; GFX90A-TGSPLIT-NEXT:    buffer_invl2
22995; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
22996; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
22997; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
22998; GFX90A-TGSPLIT-NEXT:    s_endpgm
22999;
23000; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
23001; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
23002; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
23003; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
23004; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
23005; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
23006; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
23007; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
23008; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
23009; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
23010; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
23011; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
23012; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
23013; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
23014; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
23015; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
23016; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
23017; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
23018; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
23019; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
23020;
23021; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
23022; GFX940-TGSPLIT:       ; %bb.0: ; %entry
23023; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
23024; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
23025; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
23026; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
23027; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
23028; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
23029; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
23030; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
23031; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
23032; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
23033; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
23034; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
23035; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
23036; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
23037; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
23038; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
23039; GFX940-TGSPLIT-NEXT:    s_endpgm
23040;
23041; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
23042; GFX11-WGP:       ; %bb.0: ; %entry
23043; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
23044; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
23045; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
23046; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
23047; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
23048; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
23049; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
23050; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
23051; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
23052; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
23053; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
23054; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
23055; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
23056; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
23057; GFX11-WGP-NEXT:    buffer_gl1_inv
23058; GFX11-WGP-NEXT:    buffer_gl0_inv
23059; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
23060; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
23061; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
23062; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
23063; GFX11-WGP-NEXT:    s_endpgm
23064;
23065; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
23066; GFX11-CU:       ; %bb.0: ; %entry
23067; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
23068; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
23069; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
23070; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
23071; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
23072; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
23073; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
23074; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
23075; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
23076; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
23077; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
23078; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
23079; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
23080; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
23081; GFX11-CU-NEXT:    buffer_gl1_inv
23082; GFX11-CU-NEXT:    buffer_gl0_inv
23083; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
23084; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
23085; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
23086; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
23087; GFX11-CU-NEXT:    s_endpgm
23088;
23089; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
23090; GFX12-WGP:       ; %bb.0: ; %entry
23091; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
23092; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
23093; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
23094; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
23095; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
23096; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
23097; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
23098; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
23099; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
23100; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
23101; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
23102; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
23103; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
23104; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
23105; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
23106; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
23107; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
23108; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
23109; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
23110; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
23111; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
23112; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
23113; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
23114; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
23115; GFX12-WGP-NEXT:    s_endpgm
23116;
23117; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
23118; GFX12-CU:       ; %bb.0: ; %entry
23119; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
23120; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
23121; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
23122; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
23123; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
23124; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
23125; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
23126; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
23127; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
23128; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
23129; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
23130; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
23131; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
23132; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
23133; GFX12-CU-NEXT:    s_wait_storecnt 0x0
23134; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
23135; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
23136; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
23137; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
23138; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
23139; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
23140; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
23141; GFX12-CU-NEXT:    s_wait_dscnt 0x0
23142; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
23143; GFX12-CU-NEXT:    s_endpgm
23144    ptr %out, i32 %in, i32 %old) {
23145entry:
23146  %gep = getelementptr i32, ptr %out, i32 4
23147  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
23148  %val0 = extractvalue { i32, i1 } %val, 0
23149  store i32 %val0, ptr %out, align 4
23150  ret void
23151}
23152