xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
14
; Kernel under test: reads an i32 from %in with an atomic load (syncscope
; "wavefront", ordering unordered) and writes the value to %out with an
; ordinary store.
; The autogenerated assertions below pin the llc -O0 output for every
; configuration exercised by the run lines at the top of the file. In each of
; them the only instructions expected are the scalar kernel-argument loads,
; the address moves, the waits, one flat load, one flat store and s_endpgm.
15define amdgpu_kernel void @flat_wavefront_unordered_load(
16; GFX7-LABEL: flat_wavefront_unordered_load:
17; GFX7:       ; %bb.0: ; %entry
18; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
19; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    v_mov_b32_e32 v0, s6
22; GFX7-NEXT:    v_mov_b32_e32 v1, s7
23; GFX7-NEXT:    flat_load_dword v2, v[0:1]
24; GFX7-NEXT:    v_mov_b32_e32 v0, s4
25; GFX7-NEXT:    v_mov_b32_e32 v1, s5
26; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
27; GFX7-NEXT:    flat_store_dword v[0:1], v2
28; GFX7-NEXT:    s_endpgm
29;
30; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
31; GFX10-WGP:       ; %bb.0: ; %entry
32; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
33; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
34; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
36; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
37; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
38; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
39; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
40; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
41; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
42; GFX10-WGP-NEXT:    s_endpgm
43;
44; GFX10-CU-LABEL: flat_wavefront_unordered_load:
45; GFX10-CU:       ; %bb.0: ; %entry
46; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
47; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
48; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
50; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
51; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
52; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
53; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
54; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
55; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
56; GFX10-CU-NEXT:    s_endpgm
57;
58; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load:
59; SKIP-CACHE-INV:       ; %bb.0: ; %entry
60; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
61; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
62; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
63; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
64; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
65; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
66; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
67; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
68; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
69; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
70; SKIP-CACHE-INV-NEXT:    s_endpgm
71;
72; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
73; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
74; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
75; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
76; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
78; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
79; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
80; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
81; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
82; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
83;
84; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
85; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
86; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
87; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
88; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
90; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
91; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
92; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
93; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
94; GFX90A-TGSPLIT-NEXT:    s_endpgm
95;
96; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
97; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
98; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
99; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
100; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
102; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
103; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
104; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
105; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
106; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
107;
108; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_load:
109; GFX940-TGSPLIT:       ; %bb.0: ; %entry
110; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
111; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
112; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
114; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
115; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
116; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
117; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
118; GFX940-TGSPLIT-NEXT:    s_endpgm
119;
120; GFX11-WGP-LABEL: flat_wavefront_unordered_load:
121; GFX11-WGP:       ; %bb.0: ; %entry
122; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
123; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
124; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
126; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
127; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
128; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
129; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
130; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
131; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
132; GFX11-WGP-NEXT:    s_endpgm
133;
134; GFX11-CU-LABEL: flat_wavefront_unordered_load:
135; GFX11-CU:       ; %bb.0: ; %entry
136; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
137; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
138; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
140; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
141; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
142; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
143; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
144; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
145; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
146; GFX11-CU-NEXT:    s_endpgm
147;
148; GFX12-WGP-LABEL: flat_wavefront_unordered_load:
149; GFX12-WGP:       ; %bb.0: ; %entry
150; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
151; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
152; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
153; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
154; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
155; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
156; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
157; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
158; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
159; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
160; GFX12-WGP-NEXT:    s_endpgm
161;
162; GFX12-CU-LABEL: flat_wavefront_unordered_load:
163; GFX12-CU:       ; %bb.0: ; %entry
164; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
165; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
166; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
167; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
168; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
169; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
170; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
171; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
172; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
173; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
174; GFX12-CU-NEXT:    s_endpgm
175    ptr %in, ptr %out) {
176entry:
  ; Operation under test: atomic i32 load, syncscope "wavefront", unordered.
177  %val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4
  ; Ordinary (non-atomic) store; it consumes the loaded value, which is why a
  ; wait precedes the store in every expected sequence above.
178  store i32 %val, ptr %out
179  ret void
180}
181
; Kernel under test: reads an i32 from %in with an atomic load (syncscope
; "wavefront", ordering monotonic) and writes the value to %out with an
; ordinary store.
; The autogenerated assertions below pin the llc -O0 output for every
; configuration exercised by the run lines at the top of the file. In each of
; them the only instructions expected are the scalar kernel-argument loads,
; the address moves, the waits, one flat load, one flat store and s_endpgm.
182define amdgpu_kernel void @flat_wavefront_monotonic_load(
183; GFX7-LABEL: flat_wavefront_monotonic_load:
184; GFX7:       ; %bb.0: ; %entry
185; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
186; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
187; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX7-NEXT:    v_mov_b32_e32 v0, s6
189; GFX7-NEXT:    v_mov_b32_e32 v1, s7
190; GFX7-NEXT:    flat_load_dword v2, v[0:1]
191; GFX7-NEXT:    v_mov_b32_e32 v0, s4
192; GFX7-NEXT:    v_mov_b32_e32 v1, s5
193; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
194; GFX7-NEXT:    flat_store_dword v[0:1], v2
195; GFX7-NEXT:    s_endpgm
196;
197; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
198; GFX10-WGP:       ; %bb.0: ; %entry
199; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
200; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
201; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
202; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
203; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
204; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
205; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
206; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
207; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
208; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
209; GFX10-WGP-NEXT:    s_endpgm
210;
211; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
212; GFX10-CU:       ; %bb.0: ; %entry
213; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
214; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
215; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
217; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
218; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
219; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
220; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
221; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
222; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
223; GFX10-CU-NEXT:    s_endpgm
224;
225; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load:
226; SKIP-CACHE-INV:       ; %bb.0: ; %entry
227; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
228; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
229; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
231; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
232; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
235; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
236; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
237; SKIP-CACHE-INV-NEXT:    s_endpgm
238;
239; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
240; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
241; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
242; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
243; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
245; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
246; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
247; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
248; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
249; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
250;
251; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
252; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
253; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
254; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
255; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
257; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
258; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
259; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
260; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
261; GFX90A-TGSPLIT-NEXT:    s_endpgm
262;
263; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
264; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
265; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
266; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
267; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
269; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
270; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
271; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
272; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
273; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
274;
275; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
276; GFX940-TGSPLIT:       ; %bb.0: ; %entry
277; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
278; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
279; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
280; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
281; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
282; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
283; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
284; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
285; GFX940-TGSPLIT-NEXT:    s_endpgm
286;
287; GFX11-WGP-LABEL: flat_wavefront_monotonic_load:
288; GFX11-WGP:       ; %bb.0: ; %entry
289; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
290; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
291; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
293; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
294; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
295; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
296; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
297; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
299; GFX11-WGP-NEXT:    s_endpgm
300;
301; GFX11-CU-LABEL: flat_wavefront_monotonic_load:
302; GFX11-CU:       ; %bb.0: ; %entry
303; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
304; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
305; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
307; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
308; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
309; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
310; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
311; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
312; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
313; GFX11-CU-NEXT:    s_endpgm
314;
315; GFX12-WGP-LABEL: flat_wavefront_monotonic_load:
316; GFX12-WGP:       ; %bb.0: ; %entry
317; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
318; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
319; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
320; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
321; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
322; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
323; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
324; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
325; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
326; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
327; GFX12-WGP-NEXT:    s_endpgm
328;
329; GFX12-CU-LABEL: flat_wavefront_monotonic_load:
330; GFX12-CU:       ; %bb.0: ; %entry
331; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
332; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
333; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
334; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
335; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
336; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
337; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
338; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
339; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
340; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
341; GFX12-CU-NEXT:    s_endpgm
342    ptr %in, ptr %out) {
343entry:
  ; Operation under test: atomic i32 load, syncscope "wavefront", monotonic.
344  %val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4
  ; Ordinary (non-atomic) store; it consumes the loaded value, which is why a
  ; wait precedes the store in every expected sequence above.
345  store i32 %val, ptr %out
346  ret void
347}
348
; Kernel under test: reads an i32 from %in with an atomic load (syncscope
; "wavefront", ordering acquire) and writes the value to %out with an
; ordinary store.
; The autogenerated assertions below pin the llc -O0 output for every
; configuration exercised by the run lines at the top of the file. In each of
; them the only instructions expected are the scalar kernel-argument loads,
; the address moves, the waits, one flat load, one flat store and s_endpgm;
; nothing else is expected after the load for the acquire ordering.
349define amdgpu_kernel void @flat_wavefront_acquire_load(
350; GFX7-LABEL: flat_wavefront_acquire_load:
351; GFX7:       ; %bb.0: ; %entry
352; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
353; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7-NEXT:    v_mov_b32_e32 v0, s6
356; GFX7-NEXT:    v_mov_b32_e32 v1, s7
357; GFX7-NEXT:    flat_load_dword v2, v[0:1]
358; GFX7-NEXT:    v_mov_b32_e32 v0, s4
359; GFX7-NEXT:    v_mov_b32_e32 v1, s5
360; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
361; GFX7-NEXT:    flat_store_dword v[0:1], v2
362; GFX7-NEXT:    s_endpgm
363;
364; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
365; GFX10-WGP:       ; %bb.0: ; %entry
366; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
367; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
368; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
370; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
371; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
372; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
373; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
374; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
375; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
376; GFX10-WGP-NEXT:    s_endpgm
377;
378; GFX10-CU-LABEL: flat_wavefront_acquire_load:
379; GFX10-CU:       ; %bb.0: ; %entry
380; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
381; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
382; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
384; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
385; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
387; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
388; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
389; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
390; GFX10-CU-NEXT:    s_endpgm
391;
392; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load:
393; SKIP-CACHE-INV:       ; %bb.0: ; %entry
394; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
395; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
396; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
397; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
398; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
399; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
400; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
401; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
402; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
403; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
404; SKIP-CACHE-INV-NEXT:    s_endpgm
405;
406; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
407; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
408; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
409; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
410; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
412; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
413; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
414; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
415; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
416; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
417;
418; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
419; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
420; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
421; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
422; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
424; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
425; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
426; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
427; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
428; GFX90A-TGSPLIT-NEXT:    s_endpgm
429;
430; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
431; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
432; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
433; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
434; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
436; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
437; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
438; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
439; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
440; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
441;
442; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_load:
443; GFX940-TGSPLIT:       ; %bb.0: ; %entry
444; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
445; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
446; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
448; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
449; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
450; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
451; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
452; GFX940-TGSPLIT-NEXT:    s_endpgm
453;
454; GFX11-WGP-LABEL: flat_wavefront_acquire_load:
455; GFX11-WGP:       ; %bb.0: ; %entry
456; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
457; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
458; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
460; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
461; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
462; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
463; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
464; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
465; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
466; GFX11-WGP-NEXT:    s_endpgm
467;
468; GFX11-CU-LABEL: flat_wavefront_acquire_load:
469; GFX11-CU:       ; %bb.0: ; %entry
470; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
471; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
472; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
474; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
475; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
476; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
477; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
478; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
479; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
480; GFX11-CU-NEXT:    s_endpgm
481;
482; GFX12-WGP-LABEL: flat_wavefront_acquire_load:
483; GFX12-WGP:       ; %bb.0: ; %entry
484; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
485; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
486; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
487; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
488; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
489; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
490; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
491; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
492; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
493; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
494; GFX12-WGP-NEXT:    s_endpgm
495;
496; GFX12-CU-LABEL: flat_wavefront_acquire_load:
497; GFX12-CU:       ; %bb.0: ; %entry
498; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
499; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
500; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
501; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
502; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
503; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
504; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
505; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
506; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
507; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
508; GFX12-CU-NEXT:    s_endpgm
509    ptr %in, ptr %out) {
510entry:
  ; Operation under test: atomic i32 load, syncscope "wavefront", acquire.
511  %val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4
  ; Ordinary (non-atomic) store; it consumes the loaded value, which is why a
  ; wait precedes the store in every expected sequence above.
512  store i32 %val, ptr %out
513  ret void
514}
515
; Kernel under test: reads an i32 from %in with an atomic load (syncscope
; "wavefront", ordering seq_cst) and writes the value to %out with an
; ordinary store.
; The autogenerated assertions below pin the llc -O0 output for every
; configuration exercised by the run lines at the top of the file. In each of
; them the only instructions expected are the scalar kernel-argument loads,
; the address moves, the waits, one flat load, one flat store and s_endpgm;
; nothing extra is expected before or after the load for seq_cst ordering.
516define amdgpu_kernel void @flat_wavefront_seq_cst_load(
517; GFX7-LABEL: flat_wavefront_seq_cst_load:
518; GFX7:       ; %bb.0: ; %entry
519; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
520; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
521; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX7-NEXT:    v_mov_b32_e32 v0, s6
523; GFX7-NEXT:    v_mov_b32_e32 v1, s7
524; GFX7-NEXT:    flat_load_dword v2, v[0:1]
525; GFX7-NEXT:    v_mov_b32_e32 v0, s4
526; GFX7-NEXT:    v_mov_b32_e32 v1, s5
527; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
528; GFX7-NEXT:    flat_store_dword v[0:1], v2
529; GFX7-NEXT:    s_endpgm
530;
531; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
532; GFX10-WGP:       ; %bb.0: ; %entry
533; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
534; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
535; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
536; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
537; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
538; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
539; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
540; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
541; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
542; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
543; GFX10-WGP-NEXT:    s_endpgm
544;
545; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
546; GFX10-CU:       ; %bb.0: ; %entry
547; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
548; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
549; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
551; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
552; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
553; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
554; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
555; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
556; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
557; GFX10-CU-NEXT:    s_endpgm
558;
559; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load:
560; SKIP-CACHE-INV:       ; %bb.0: ; %entry
561; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
562; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
563; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
564; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
566; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
568; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
569; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
570; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
571; SKIP-CACHE-INV-NEXT:    s_endpgm
572;
573; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
574; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
575; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
576; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
577; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
579; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
580; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
581; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
582; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
583; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
584;
585; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
586; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
587; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
588; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
589; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
591; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
592; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
593; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
594; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
595; GFX90A-TGSPLIT-NEXT:    s_endpgm
596;
597; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
598; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
599; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
600; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
601; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
603; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
604; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
605; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
606; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
607; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
608;
609; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
610; GFX940-TGSPLIT:       ; %bb.0: ; %entry
611; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
612; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
613; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
615; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
616; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
617; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
618; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
619; GFX940-TGSPLIT-NEXT:    s_endpgm
620;
621; GFX11-WGP-LABEL: flat_wavefront_seq_cst_load:
622; GFX11-WGP:       ; %bb.0: ; %entry
623; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
624; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
625; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
626; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
627; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
628; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
629; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
630; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
631; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
632; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
633; GFX11-WGP-NEXT:    s_endpgm
634;
635; GFX11-CU-LABEL: flat_wavefront_seq_cst_load:
636; GFX11-CU:       ; %bb.0: ; %entry
637; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
638; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
639; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
640; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
641; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
642; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
643; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
644; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
645; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
646; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
647; GFX11-CU-NEXT:    s_endpgm
648;
649; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load:
650; GFX12-WGP:       ; %bb.0: ; %entry
651; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
652; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
653; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
654; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
655; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
656; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
657; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
658; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
659; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
660; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
661; GFX12-WGP-NEXT:    s_endpgm
662;
663; GFX12-CU-LABEL: flat_wavefront_seq_cst_load:
664; GFX12-CU:       ; %bb.0: ; %entry
665; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
666; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
667; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
668; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
669; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
670; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
671; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
672; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
673; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
674; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
675; GFX12-CU-NEXT:    s_endpgm
676    ptr %in, ptr %out) {
677entry:
  ; Operation under test: atomic i32 load, syncscope "wavefront", seq_cst.
678  %val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4
  ; Ordinary (non-atomic) store; it consumes the loaded value, which is why a
  ; wait precedes the store in every expected sequence above.
679  store i32 %val, ptr %out
680  ret void
681}
682
; Test case: atomic i32 store of %in through the pointer %out with "unordered"
; ordering at syncscope("wavefront"). In every RUN configuration the expected
; output is the kernel-argument loads, a wait, the register moves and a single
; flat store (flat_store_dword / flat_store_b32) followed by s_endpgm; the
; gfx940 variants additionally carry "sc0 sc1" on the store.
; The assertions embedded in the signature below are autogenerated (see the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
683define amdgpu_kernel void @flat_wavefront_unordered_store(
684; GFX7-LABEL: flat_wavefront_unordered_store:
685; GFX7:       ; %bb.0: ; %entry
686; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
687; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
688; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX7-NEXT:    v_mov_b32_e32 v0, s6
690; GFX7-NEXT:    v_mov_b32_e32 v1, s7
691; GFX7-NEXT:    v_mov_b32_e32 v2, s4
692; GFX7-NEXT:    flat_store_dword v[0:1], v2
693; GFX7-NEXT:    s_endpgm
694;
695; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
696; GFX10-WGP:       ; %bb.0: ; %entry
697; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
698; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
699; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
701; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
702; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
703; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
704; GFX10-WGP-NEXT:    s_endpgm
705;
706; GFX10-CU-LABEL: flat_wavefront_unordered_store:
707; GFX10-CU:       ; %bb.0: ; %entry
708; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
709; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
710; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
712; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
713; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
714; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
715; GFX10-CU-NEXT:    s_endpgm
716;
717; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store:
718; SKIP-CACHE-INV:       ; %bb.0: ; %entry
719; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
720; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
721; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
722; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
723; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
724; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
725; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
726; SKIP-CACHE-INV-NEXT:    s_endpgm
727;
728; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
729; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
730; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
731; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
732; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
734; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
735; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
736; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
737;
738; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
739; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
740; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
741; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
742; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
743; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
744; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
745; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
746; GFX90A-TGSPLIT-NEXT:    s_endpgm
747;
748; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
749; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
750; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
751; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
752; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
754; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
755; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
756; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
757;
758; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_store:
759; GFX940-TGSPLIT:       ; %bb.0: ; %entry
760; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
761; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
762; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
763; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
764; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
765; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
766; GFX940-TGSPLIT-NEXT:    s_endpgm
767;
768; GFX11-WGP-LABEL: flat_wavefront_unordered_store:
769; GFX11-WGP:       ; %bb.0: ; %entry
770; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
771; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
772; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
774; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
775; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
776; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
777; GFX11-WGP-NEXT:    s_endpgm
778;
779; GFX11-CU-LABEL: flat_wavefront_unordered_store:
780; GFX11-CU:       ; %bb.0: ; %entry
781; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
782; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
783; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
785; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
786; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
787; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
788; GFX11-CU-NEXT:    s_endpgm
789;
790; GFX12-WGP-LABEL: flat_wavefront_unordered_store:
791; GFX12-WGP:       ; %bb.0: ; %entry
792; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
793; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
794; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
795; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
796; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
797; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
798; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
799; GFX12-WGP-NEXT:    s_endpgm
800;
801; GFX12-CU-LABEL: flat_wavefront_unordered_store:
802; GFX12-CU:       ; %bb.0: ; %entry
803; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
804; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
805; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
806; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
807; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
808; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
809; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
810; GFX12-CU-NEXT:    s_endpgm
811    i32 %in, ptr %out) {
812entry:
813  store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
814  ret void
815}
816
; Test case: atomic i32 store of %in through the pointer %out with "monotonic"
; ordering at syncscope("wavefront"). In every RUN configuration the expected
; output is the kernel-argument loads, a wait, the register moves and a single
; flat store (flat_store_dword / flat_store_b32) followed by s_endpgm; the
; gfx940 variants additionally carry "sc0 sc1" on the store.
; The assertions embedded in the signature below are autogenerated (see the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
817define amdgpu_kernel void @flat_wavefront_monotonic_store(
818; GFX7-LABEL: flat_wavefront_monotonic_store:
819; GFX7:       ; %bb.0: ; %entry
820; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
821; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
822; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX7-NEXT:    v_mov_b32_e32 v0, s6
824; GFX7-NEXT:    v_mov_b32_e32 v1, s7
825; GFX7-NEXT:    v_mov_b32_e32 v2, s4
826; GFX7-NEXT:    flat_store_dword v[0:1], v2
827; GFX7-NEXT:    s_endpgm
828;
829; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
830; GFX10-WGP:       ; %bb.0: ; %entry
831; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
832; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
833; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
834; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
835; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
836; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
837; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
838; GFX10-WGP-NEXT:    s_endpgm
839;
840; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
841; GFX10-CU:       ; %bb.0: ; %entry
842; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
843; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
844; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
845; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
846; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
847; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
848; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
849; GFX10-CU-NEXT:    s_endpgm
850;
851; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store:
852; SKIP-CACHE-INV:       ; %bb.0: ; %entry
853; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
854; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
855; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
856; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
857; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
858; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
859; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
860; SKIP-CACHE-INV-NEXT:    s_endpgm
861;
862; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
863; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
864; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
865; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
866; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
868; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
869; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
870; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
871;
872; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
873; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
874; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
875; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
876; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
878; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
879; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
880; GFX90A-TGSPLIT-NEXT:    s_endpgm
881;
882; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
883; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
884; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
885; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
886; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
888; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
889; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
890; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
891;
892; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
893; GFX940-TGSPLIT:       ; %bb.0: ; %entry
894; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
895; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
896; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
897; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
898; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
899; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
900; GFX940-TGSPLIT-NEXT:    s_endpgm
901;
902; GFX11-WGP-LABEL: flat_wavefront_monotonic_store:
903; GFX11-WGP:       ; %bb.0: ; %entry
904; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
905; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
906; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
908; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
909; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
910; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
911; GFX11-WGP-NEXT:    s_endpgm
912;
913; GFX11-CU-LABEL: flat_wavefront_monotonic_store:
914; GFX11-CU:       ; %bb.0: ; %entry
915; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
916; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
917; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
919; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
920; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
921; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
922; GFX11-CU-NEXT:    s_endpgm
923;
924; GFX12-WGP-LABEL: flat_wavefront_monotonic_store:
925; GFX12-WGP:       ; %bb.0: ; %entry
926; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
927; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
928; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
929; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
930; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
931; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
932; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
933; GFX12-WGP-NEXT:    s_endpgm
934;
935; GFX12-CU-LABEL: flat_wavefront_monotonic_store:
936; GFX12-CU:       ; %bb.0: ; %entry
937; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
938; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
939; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
940; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
941; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
942; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
943; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
944; GFX12-CU-NEXT:    s_endpgm
945    i32 %in, ptr %out) {
946entry:
947  store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
948  ret void
949}
950
; Test case: atomic i32 store of %in through the pointer %out with "release"
; ordering at syncscope("wavefront"). In every RUN configuration the expected
; output is the kernel-argument loads, a wait, the register moves and a single
; flat store (flat_store_dword / flat_store_b32) followed by s_endpgm; the
; gfx940 variants additionally carry "sc0 sc1" on the store.
; The assertions embedded in the signature below are autogenerated (see the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
951define amdgpu_kernel void @flat_wavefront_release_store(
952; GFX7-LABEL: flat_wavefront_release_store:
953; GFX7:       ; %bb.0: ; %entry
954; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
955; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
956; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
957; GFX7-NEXT:    v_mov_b32_e32 v0, s6
958; GFX7-NEXT:    v_mov_b32_e32 v1, s7
959; GFX7-NEXT:    v_mov_b32_e32 v2, s4
960; GFX7-NEXT:    flat_store_dword v[0:1], v2
961; GFX7-NEXT:    s_endpgm
962;
963; GFX10-WGP-LABEL: flat_wavefront_release_store:
964; GFX10-WGP:       ; %bb.0: ; %entry
965; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
966; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
967; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
969; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
970; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
971; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
972; GFX10-WGP-NEXT:    s_endpgm
973;
974; GFX10-CU-LABEL: flat_wavefront_release_store:
975; GFX10-CU:       ; %bb.0: ; %entry
976; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
977; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
978; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
980; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
981; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
982; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
983; GFX10-CU-NEXT:    s_endpgm
984;
985; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store:
986; SKIP-CACHE-INV:       ; %bb.0: ; %entry
987; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
988; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
989; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
990; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
991; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
992; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
993; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
994; SKIP-CACHE-INV-NEXT:    s_endpgm
995;
996; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
997; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
998; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
999; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1000; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1001; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1002; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1003; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1004; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1005;
1006; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
1007; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1008; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1009; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1010; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1012; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1013; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1014; GFX90A-TGSPLIT-NEXT:    s_endpgm
1015;
1016; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
1017; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1018; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1019; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1020; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1021; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1022; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1023; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1024; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1025;
1026; GFX940-TGSPLIT-LABEL: flat_wavefront_release_store:
1027; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1028; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1029; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1030; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1032; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1033; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1034; GFX940-TGSPLIT-NEXT:    s_endpgm
1035;
1036; GFX11-WGP-LABEL: flat_wavefront_release_store:
1037; GFX11-WGP:       ; %bb.0: ; %entry
1038; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1039; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1040; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1041; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1042; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1043; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1044; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1045; GFX11-WGP-NEXT:    s_endpgm
1046;
1047; GFX11-CU-LABEL: flat_wavefront_release_store:
1048; GFX11-CU:       ; %bb.0: ; %entry
1049; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1050; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1051; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1053; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1054; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1055; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1056; GFX11-CU-NEXT:    s_endpgm
1057;
1058; GFX12-WGP-LABEL: flat_wavefront_release_store:
1059; GFX12-WGP:       ; %bb.0: ; %entry
1060; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1061; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1062; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1063; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1064; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1065; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1066; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
1067; GFX12-WGP-NEXT:    s_endpgm
1068;
1069; GFX12-CU-LABEL: flat_wavefront_release_store:
1070; GFX12-CU:       ; %bb.0: ; %entry
1071; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1072; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1073; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1074; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1075; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1076; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1077; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
1078; GFX12-CU-NEXT:    s_endpgm
1079    i32 %in, ptr %out) {
1080entry:
1081  store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
1082  ret void
1083}
1084
; Test case: atomic i32 store of %in through the pointer %out with "seq_cst"
; ordering at syncscope("wavefront"). In every RUN configuration the expected
; output is the kernel-argument loads, a wait, the register moves and a single
; flat store (flat_store_dword / flat_store_b32) followed by s_endpgm; the
; gfx940 variants additionally carry "sc0 sc1" on the store.
; The assertions embedded in the signature below are autogenerated (see the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1085define amdgpu_kernel void @flat_wavefront_seq_cst_store(
1086; GFX7-LABEL: flat_wavefront_seq_cst_store:
1087; GFX7:       ; %bb.0: ; %entry
1088; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
1089; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
1090; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1091; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1092; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1093; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1094; GFX7-NEXT:    flat_store_dword v[0:1], v2
1095; GFX7-NEXT:    s_endpgm
1096;
1097; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
1098; GFX10-WGP:       ; %bb.0: ; %entry
1099; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
1100; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1101; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1102; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1103; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1104; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1105; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1106; GFX10-WGP-NEXT:    s_endpgm
1107;
1108; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
1109; GFX10-CU:       ; %bb.0: ; %entry
1110; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
1111; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1112; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1114; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1115; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1116; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1117; GFX10-CU-NEXT:    s_endpgm
1118;
1119; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store:
1120; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1121; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
1122; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1123; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1124; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1125; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1126; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1127; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1128; SKIP-CACHE-INV-NEXT:    s_endpgm
1129;
1130; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
1131; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1132; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1133; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1134; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1135; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1136; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1137; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1138; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1139;
1140; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
1141; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1142; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
1143; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
1144; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1146; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1147; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1148; GFX90A-TGSPLIT-NEXT:    s_endpgm
1149;
1150; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
1151; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1152; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1153; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1154; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1155; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1156; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1157; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1158; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1159;
1160; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
1161; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1162; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
1163; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1164; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1166; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1167; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1168; GFX940-TGSPLIT-NEXT:    s_endpgm
1169;
1170; GFX11-WGP-LABEL: flat_wavefront_seq_cst_store:
1171; GFX11-WGP:       ; %bb.0: ; %entry
1172; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1173; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1174; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1175; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1176; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1177; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1178; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
1179; GFX11-WGP-NEXT:    s_endpgm
1180;
1181; GFX11-CU-LABEL: flat_wavefront_seq_cst_store:
1182; GFX11-CU:       ; %bb.0: ; %entry
1183; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1184; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1185; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1186; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1187; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1188; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1189; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
1190; GFX11-CU-NEXT:    s_endpgm
1191;
1192; GFX12-WGP-LABEL: flat_wavefront_seq_cst_store:
1193; GFX12-WGP:       ; %bb.0: ; %entry
1194; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
1195; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1196; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1197; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1198; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1199; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1200; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
1201; GFX12-WGP-NEXT:    s_endpgm
1202;
1203; GFX12-CU-LABEL: flat_wavefront_seq_cst_store:
1204; GFX12-CU:       ; %bb.0: ; %entry
1205; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
1206; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
1207; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1208; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1209; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1210; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1211; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
1212; GFX12-CU-NEXT:    s_endpgm
1213    i32 %in, ptr %out) {
1214entry:
1215  store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
1216  ret void
1217}
1218
; Test case: volatile "atomicrmw xchg" of %in on the pointer %out with
; "monotonic" ordering at syncscope("wavefront"); the result %val is not used.
; In every RUN configuration the expected output is the kernel-argument loads,
; a wait, the register moves and a single flat atomic swap
; (flat_atomic_swap / flat_atomic_swap_b32) with no destination register,
; followed by s_endpgm.
; The assertions embedded in the signature below are autogenerated (see the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1219define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
1220; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
1221; GFX7:       ; %bb.0: ; %entry
1222; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1223; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1224; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1225; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1226; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1227; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1228; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1229; GFX7-NEXT:    s_endpgm
1230;
1231; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
1232; GFX10-WGP:       ; %bb.0: ; %entry
1233; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1234; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1235; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1237; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1238; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1239; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1240; GFX10-WGP-NEXT:    s_endpgm
1241;
1242; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
1243; GFX10-CU:       ; %bb.0: ; %entry
1244; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1245; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1246; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1247; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1248; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1249; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1250; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1251; GFX10-CU-NEXT:    s_endpgm
1252;
1253; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw:
1254; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1255; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1256; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1257; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1258; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1259; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1260; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1261; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1262; SKIP-CACHE-INV-NEXT:    s_endpgm
1263;
1264; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
1265; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1266; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1267; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1268; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1269; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1270; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1271; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1272; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1273;
1274; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
1275; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1276; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1277; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1278; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1280; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1281; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1282; GFX90A-TGSPLIT-NEXT:    s_endpgm
1283;
1284; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
1285; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1286; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1287; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1288; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1290; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1291; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1292; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1293;
1294; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
1295; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1296; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1297; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1298; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1299; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1300; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1301; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1302; GFX940-TGSPLIT-NEXT:    s_endpgm
1303;
1304; GFX11-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
1305; GFX11-WGP:       ; %bb.0: ; %entry
1306; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1307; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1308; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1310; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1311; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1312; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1313; GFX11-WGP-NEXT:    s_endpgm
1314;
1315; GFX11-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
1316; GFX11-CU:       ; %bb.0: ; %entry
1317; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1318; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1319; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1320; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1321; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1322; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1323; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1324; GFX11-CU-NEXT:    s_endpgm
1325;
1326; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
1327; GFX12-WGP:       ; %bb.0: ; %entry
1328; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1329; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1330; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1331; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1332; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1333; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1334; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1335; GFX12-WGP-NEXT:    s_endpgm
1336;
1337; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
1338; GFX12-CU:       ; %bb.0: ; %entry
1339; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1340; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1341; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1342; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1343; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1344; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1345; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1346; GFX12-CU-NEXT:    s_endpgm
1347    ptr %out, i32 %in) {
1348entry:
1349  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
1350  ret void
1351}
1352
; Kernel under test: a volatile `atomicrmw xchg` on %out with
; syncscope("wavefront") and acquire ordering; the result %val has no uses.
; For every check prefix below, the expected code is the kernel-argument loads,
; a wait on those loads, the moves into v0-v2, one flat_atomic_swap
; (flat_atomic_swap_b32 for the GFX11 and GFX12 prefixes) and then s_endpgm
; directly after the swap, with no other instruction in between.
1353define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
1354; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
1355; GFX7:       ; %bb.0: ; %entry
1356; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1357; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1358; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1359; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1360; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1361; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1362; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1363; GFX7-NEXT:    s_endpgm
1364;
1365; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
1366; GFX10-WGP:       ; %bb.0: ; %entry
1367; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1368; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1369; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1371; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1372; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1373; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1374; GFX10-WGP-NEXT:    s_endpgm
1375;
1376; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
1377; GFX10-CU:       ; %bb.0: ; %entry
1378; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1379; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1380; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1381; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1382; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1383; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1384; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1385; GFX10-CU-NEXT:    s_endpgm
1386;
1387; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw:
1388; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1389; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1390; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1391; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1392; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1393; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1394; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1395; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1396; SKIP-CACHE-INV-NEXT:    s_endpgm
1397;
1398; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
1399; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1400; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1401; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1402; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1404; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1405; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1406; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1407;
1408; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
1409; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1410; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1411; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1412; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1413; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1414; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1415; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1416; GFX90A-TGSPLIT-NEXT:    s_endpgm
1417;
1418; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
1419; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1420; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1421; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1422; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1423; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1424; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1425; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1426; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1427;
1428; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
1429; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1430; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1431; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1432; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1434; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1435; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1436; GFX940-TGSPLIT-NEXT:    s_endpgm
1437;
1438; GFX11-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
1439; GFX11-WGP:       ; %bb.0: ; %entry
1440; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1441; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1442; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1443; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1444; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1445; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1446; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1447; GFX11-WGP-NEXT:    s_endpgm
1448;
1449; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw:
1450; GFX11-CU:       ; %bb.0: ; %entry
1451; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1452; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1453; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1454; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1455; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1456; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1457; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1458; GFX11-CU-NEXT:    s_endpgm
1459;
1460; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
1461; GFX12-WGP:       ; %bb.0: ; %entry
1462; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1463; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1464; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1465; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1466; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1467; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1468; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1469; GFX12-WGP-NEXT:    s_endpgm
1470;
1471; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw:
1472; GFX12-CU:       ; %bb.0: ; %entry
1473; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1474; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1475; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1476; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1477; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1478; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1479; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1480; GFX12-CU-NEXT:    s_endpgm
1481    ptr %out, i32 %in) {
1482entry:
  ; Exchange %in into *%out; %val is not used afterwards.
1483  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
1484  ret void
1485}
1486
; Kernel under test: a volatile `atomicrmw xchg` on %out with
; syncscope("wavefront") and release ordering; the result %val has no uses.
; For every check prefix below, the expected code is the kernel-argument loads,
; a wait on those loads, the moves into v0-v2, one flat_atomic_swap
; (flat_atomic_swap_b32 for the GFX11 and GFX12 prefixes) and then s_endpgm
; directly after the swap, with no other instruction in between.
1487define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
1488; GFX7-LABEL: flat_wavefront_release_atomicrmw:
1489; GFX7:       ; %bb.0: ; %entry
1490; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1491; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1492; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1493; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1494; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1495; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1496; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1497; GFX7-NEXT:    s_endpgm
1498;
1499; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
1500; GFX10-WGP:       ; %bb.0: ; %entry
1501; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1502; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1503; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1504; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1505; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1506; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1507; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1508; GFX10-WGP-NEXT:    s_endpgm
1509;
1510; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
1511; GFX10-CU:       ; %bb.0: ; %entry
1512; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1513; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1514; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1516; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1517; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1518; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1519; GFX10-CU-NEXT:    s_endpgm
1520;
1521; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw:
1522; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1523; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1524; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1525; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1526; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1527; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1529; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1530; SKIP-CACHE-INV-NEXT:    s_endpgm
1531;
1532; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
1533; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1534; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1535; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1536; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1537; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1538; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1539; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1540; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1541;
1542; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
1543; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1544; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1545; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1546; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1547; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1548; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1549; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1550; GFX90A-TGSPLIT-NEXT:    s_endpgm
1551;
1552; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
1553; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1554; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1555; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1556; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1557; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1558; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1559; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1560; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1561;
1562; GFX940-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
1563; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1564; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1565; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1566; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1567; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1568; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1569; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1570; GFX940-TGSPLIT-NEXT:    s_endpgm
1571;
1572; GFX11-WGP-LABEL: flat_wavefront_release_atomicrmw:
1573; GFX11-WGP:       ; %bb.0: ; %entry
1574; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1575; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1576; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1577; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1578; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1579; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1580; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1581; GFX11-WGP-NEXT:    s_endpgm
1582;
1583; GFX11-CU-LABEL: flat_wavefront_release_atomicrmw:
1584; GFX11-CU:       ; %bb.0: ; %entry
1585; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1586; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1587; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1588; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1589; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1590; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1591; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1592; GFX11-CU-NEXT:    s_endpgm
1593;
1594; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw:
1595; GFX12-WGP:       ; %bb.0: ; %entry
1596; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1597; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1598; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1599; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1600; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1601; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1602; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1603; GFX12-WGP-NEXT:    s_endpgm
1604;
1605; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw:
1606; GFX12-CU:       ; %bb.0: ; %entry
1607; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1608; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1609; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1610; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1611; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1612; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1613; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1614; GFX12-CU-NEXT:    s_endpgm
1615    ptr %out, i32 %in) {
1616entry:
  ; Exchange %in into *%out; %val is not used afterwards.
1617  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
1618  ret void
1619}
1620
; Kernel under test: a volatile `atomicrmw xchg` on %out with
; syncscope("wavefront") and acq_rel ordering; the result %val has no uses.
; For every check prefix below, the expected code is the kernel-argument loads,
; a wait on those loads, the moves into v0-v2, one flat_atomic_swap
; (flat_atomic_swap_b32 for the GFX11 and GFX12 prefixes) and then s_endpgm
; directly after the swap, with no other instruction in between.
1621define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
1622; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
1623; GFX7:       ; %bb.0: ; %entry
1624; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1625; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1626; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1627; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1628; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1629; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1630; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1631; GFX7-NEXT:    s_endpgm
1632;
1633; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
1634; GFX10-WGP:       ; %bb.0: ; %entry
1635; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1636; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1637; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1638; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1639; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1640; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1641; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1642; GFX10-WGP-NEXT:    s_endpgm
1643;
1644; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
1645; GFX10-CU:       ; %bb.0: ; %entry
1646; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1647; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1648; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1650; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1651; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1652; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1653; GFX10-CU-NEXT:    s_endpgm
1654;
1655; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw:
1656; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1657; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1658; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1659; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1660; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1661; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1662; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1663; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1664; SKIP-CACHE-INV-NEXT:    s_endpgm
1665;
1666; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
1667; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1668; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1669; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1670; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1671; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1672; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1673; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1674; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1675;
1676; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
1677; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1678; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1679; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1680; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1681; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1682; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1683; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1684; GFX90A-TGSPLIT-NEXT:    s_endpgm
1685;
1686; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
1687; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1688; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1689; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1690; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1691; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1692; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1693; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1694; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1695;
1696; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
1697; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1698; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1699; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1700; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1701; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1702; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1703; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1704; GFX940-TGSPLIT-NEXT:    s_endpgm
1705;
1706; GFX11-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
1707; GFX11-WGP:       ; %bb.0: ; %entry
1708; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1709; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1710; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1712; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1713; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1714; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1715; GFX11-WGP-NEXT:    s_endpgm
1716;
1717; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
1718; GFX11-CU:       ; %bb.0: ; %entry
1719; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1720; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1721; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1722; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1723; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1724; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1725; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1726; GFX11-CU-NEXT:    s_endpgm
1727;
1728; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
1729; GFX12-WGP:       ; %bb.0: ; %entry
1730; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1731; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1732; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1733; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1734; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1735; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1736; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1737; GFX12-WGP-NEXT:    s_endpgm
1738;
1739; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
1740; GFX12-CU:       ; %bb.0: ; %entry
1741; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1742; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1743; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1744; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1745; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1746; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1747; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1748; GFX12-CU-NEXT:    s_endpgm
1749    ptr %out, i32 %in) {
1750entry:
  ; Exchange %in into *%out; %val is not used afterwards.
1751  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
1752  ret void
1753}
1754
; Kernel under test: a volatile `atomicrmw xchg` on %out with
; syncscope("wavefront") and seq_cst ordering; the result %val has no uses.
; For every check prefix below, the expected code is the kernel-argument loads,
; a wait on those loads, the moves into v0-v2, one flat_atomic_swap
; (flat_atomic_swap_b32 for the GFX11 and GFX12 prefixes) and then s_endpgm
; directly after the swap, with no other instruction in between.
1755define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
1756; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
1757; GFX7:       ; %bb.0: ; %entry
1758; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1759; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1760; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1761; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1762; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1763; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1764; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1765; GFX7-NEXT:    s_endpgm
1766;
1767; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
1768; GFX10-WGP:       ; %bb.0: ; %entry
1769; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1770; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
1771; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1772; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
1773; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
1774; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
1775; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
1776; GFX10-WGP-NEXT:    s_endpgm
1777;
1778; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
1779; GFX10-CU:       ; %bb.0: ; %entry
1780; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1781; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
1782; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1783; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
1784; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
1785; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
1786; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
1787; GFX10-CU-NEXT:    s_endpgm
1788;
1789; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw:
1790; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1791; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1792; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
1793; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1794; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1795; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1796; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1797; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
1798; SKIP-CACHE-INV-NEXT:    s_endpgm
1799;
1800; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
1801; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1802; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1803; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1804; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1805; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1806; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1807; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1808; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1809;
1810; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
1811; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1812; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1813; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
1814; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1815; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1816; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
1817; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1818; GFX90A-TGSPLIT-NEXT:    s_endpgm
1819;
1820; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
1821; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1822; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1823; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1824; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1826; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1827; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1828; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1829;
1830; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
1831; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1832; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1833; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
1834; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1835; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
1836; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
1837; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
1838; GFX940-TGSPLIT-NEXT:    s_endpgm
1839;
1840; GFX11-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
1841; GFX11-WGP:       ; %bb.0: ; %entry
1842; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1843; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1844; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1845; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
1846; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
1847; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
1848; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1849; GFX11-WGP-NEXT:    s_endpgm
1850;
1851; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
1852; GFX11-CU:       ; %bb.0: ; %entry
1853; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1854; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1855; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1856; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
1857; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
1858; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
1859; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1860; GFX11-CU-NEXT:    s_endpgm
1861;
1862; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
1863; GFX12-WGP:       ; %bb.0: ; %entry
1864; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1865; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
1866; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1867; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
1868; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
1869; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
1870; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1871; GFX12-WGP-NEXT:    s_endpgm
1872;
1873; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
1874; GFX12-CU:       ; %bb.0: ; %entry
1875; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
1876; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
1877; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1878; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
1879; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
1880; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
1881; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
1882; GFX12-CU-NEXT:    s_endpgm
1883    ptr %out, i32 %in) {
1884entry:
  ; Exchange %in into *%out; %val is not used afterwards.
1885  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
1886  ret void
1887}
1888
; Kernel under test: a volatile `atomicrmw xchg` on %out with
; syncscope("wavefront") and acquire ordering whose result %val is then stored
; back to %out.
; The expected swap is the returning form: it writes v2 and carries `glc`
; (`sc0` for the GFX940 prefixes, `th:TH_ATOMIC_RETURN` for the GFX12 prefixes).
; The address registers v[0:1] are expected to be re-materialized after the
; swap, followed by a wait before the flat store of v2. The expected wait is
; `s_waitcnt vmcnt(0) lgkmcnt(0)` for most prefixes, `s_waitcnt vmcnt(0)` alone
; for the GFX90A-TGSPLIT and GFX940-TGSPLIT prefixes, and
; `s_wait_loadcnt_dscnt 0x0` for the GFX12 prefixes.
; For the GFX940 prefixes the expected store additionally carries `sc0 sc1`.
1889define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
1890; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1891; GFX7:       ; %bb.0: ; %entry
1892; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1893; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
1894; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1895; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1896; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1897; GFX7-NEXT:    v_mov_b32_e32 v2, s6
1898; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1899; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1900; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1901; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1902; GFX7-NEXT:    flat_store_dword v[0:1], v2
1903; GFX7-NEXT:    s_endpgm
1904;
1905; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1906; GFX10-WGP:       ; %bb.0: ; %entry
1907; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1908; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
1909; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1910; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
1911; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
1912; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
1913; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1914; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
1915; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
1916; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1917; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1918; GFX10-WGP-NEXT:    s_endpgm
1919;
1920; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1921; GFX10-CU:       ; %bb.0: ; %entry
1922; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1923; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
1924; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
1926; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
1927; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
1928; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1929; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
1930; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
1931; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1932; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1933; GFX10-CU-NEXT:    s_endpgm
1934;
1935; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1936; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1937; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1938; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
1939; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1940; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1941; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1942; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
1943; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1944; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1945; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1946; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1947; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1948; SKIP-CACHE-INV-NEXT:    s_endpgm
1949;
1950; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1951; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1952; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1953; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1954; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1956; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
1957; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1958; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1959; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1960; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1961; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1962;
1963; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1964; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1965; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1966; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1967; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1969; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
1970; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1971; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1972; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1973; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
1974; GFX90A-TGSPLIT-NEXT:    s_endpgm
1975;
1976; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1977; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1978; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1979; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1980; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1981; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1982; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1983; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1984; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1985; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1986; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
1987; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1988;
1989; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
1990; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1991; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1992; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1993; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1994; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1995; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
1996; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
1997; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1998; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
1999; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2000; GFX940-TGSPLIT-NEXT:    s_endpgm
2001;
2002; GFX11-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
2003; GFX11-WGP:       ; %bb.0: ; %entry
2004; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2005; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2006; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2007; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2008; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2009; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2010; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2011; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2012; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2013; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2014; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2015; GFX11-WGP-NEXT:    s_endpgm
2016;
2017; GFX11-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
2018; GFX11-CU:       ; %bb.0: ; %entry
2019; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2020; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2021; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2022; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2023; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2024; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2025; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2026; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2027; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2028; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2029; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2030; GFX11-CU-NEXT:    s_endpgm
2031;
2032; GFX12-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
2033; GFX12-WGP:       ; %bb.0: ; %entry
2034; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2035; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2036; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2037; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2038; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2039; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2040; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2041; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2042; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2043; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2044; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2045; GFX12-WGP-NEXT:    s_endpgm
2046;
2047; GFX12-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
2048; GFX12-CU:       ; %bb.0: ; %entry
2049; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2050; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2051; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2052; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2053; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2054; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2055; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2056; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2057; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2058; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2059; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2060; GFX12-CU-NEXT:    s_endpgm
2061    ptr %out, i32 %in) {
2062entry:
  ; Exchange %in into *%out and keep the value returned by the atomic in %val.
2063  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
  ; Using %val in a store is what distinguishes this kernel from the variants
  ; whose atomicrmw result is unused.
2064  store i32 %val, ptr %out, align 4
2065  ret void
2066}
2067
; Returning atomic exchange through a flat pointer at syncscope("wavefront")
; with acq_rel ordering: the kernel swaps %in into *%out and then writes the
; value returned by the exchange back to %out with an ordinary store.
;
; The assertions inside the signature are autogenerated (see the NOTE on the
; first line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected sequences show, per RUN configuration:
;  - the swap is emitted in its returning form (glc, sc0 or
;    th TH_ATOMIC_RETURN, depending on the target);
;  - only a counter wait sits between the swap and the dependent store; the
;    tgsplit configurations wait on vmcnt(0) alone, the other pre-GFX12
;    configurations wait on vmcnt(0) lgkmcnt(0);
;  - no cache invalidate or write-back instruction is expected anywhere.
; NOTE(review): presumably the absence of cache maintenance is because
; wavefront scope does not require it - confirm against the AMDGPU memory model.
2068define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
2069; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2070; GFX7:       ; %bb.0: ; %entry
2071; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2072; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2073; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2074; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2075; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2076; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2077; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2078; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2079; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2080; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2081; GFX7-NEXT:    flat_store_dword v[0:1], v2
2082; GFX7-NEXT:    s_endpgm
2083;
2084; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2085; GFX10-WGP:       ; %bb.0: ; %entry
2086; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2087; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2088; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2089; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2090; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2091; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2092; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2093; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2094; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2095; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2096; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2097; GFX10-WGP-NEXT:    s_endpgm
2098;
2099; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2100; GFX10-CU:       ; %bb.0: ; %entry
2101; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2102; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2103; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2104; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2105; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2106; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2107; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2108; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2109; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2110; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2111; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2112; GFX10-CU-NEXT:    s_endpgm
2113;
2114; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2115; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2116; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2117; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2118; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2119; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2120; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2121; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2122; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2123; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2124; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2125; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2126; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2127; SKIP-CACHE-INV-NEXT:    s_endpgm
2128;
2129; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2130; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2131; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2132; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2133; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2134; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2135; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2136; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2137; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2138; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2139; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2140; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2141;
2142; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2143; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2144; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2145; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2146; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2147; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2148; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2149; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2150; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2151; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2152; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2153; GFX90A-TGSPLIT-NEXT:    s_endpgm
2154;
2155; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2156; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2157; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2158; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2159; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2161; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2162; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2163; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2164; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2165; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2166; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2167;
2168; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2169; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2170; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2171; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2172; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2173; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2174; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2175; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2176; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2177; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2178; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2179; GFX940-TGSPLIT-NEXT:    s_endpgm
2180;
2181; GFX11-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2182; GFX11-WGP:       ; %bb.0: ; %entry
2183; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2184; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2185; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2186; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2187; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2188; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2189; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2190; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2191; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2192; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2193; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2194; GFX11-WGP-NEXT:    s_endpgm
2195;
2196; GFX11-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2197; GFX11-CU:       ; %bb.0: ; %entry
2198; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2199; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2200; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2201; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2202; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2203; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2204; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2205; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2206; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2207; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2208; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2209; GFX11-CU-NEXT:    s_endpgm
2210;
2211; GFX12-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2212; GFX12-WGP:       ; %bb.0: ; %entry
2213; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2214; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2215; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2216; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2217; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2218; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2219; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2220; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2221; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2222; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2223; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2224; GFX12-WGP-NEXT:    s_endpgm
2225;
2226; GFX12-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
2227; GFX12-CU:       ; %bb.0: ; %entry
2228; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2229; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2230; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2231; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2232; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2233; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2234; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2235; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2236; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2237; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2238; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2239; GFX12-CU-NEXT:    s_endpgm
2240    ptr %out, i32 %in) {
2241entry:
2242  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
2243  store i32 %val, ptr %out, align 4
2244  ret void
2245}
2246
; Returning atomic exchange through a flat pointer at syncscope("wavefront")
; with seq_cst ordering: the kernel swaps %in into *%out and then writes the
; value returned by the exchange back to %out with an ordinary store.
;
; The assertions inside the signature are autogenerated (see the NOTE on the
; first line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected sequences show, per RUN configuration:
;  - the swap is emitted in its returning form (glc, sc0 or
;    th TH_ATOMIC_RETURN, depending on the target);
;  - only a counter wait sits between the swap and the dependent store; the
;    tgsplit configurations wait on vmcnt(0) alone, the other pre-GFX12
;    configurations wait on vmcnt(0) lgkmcnt(0);
;  - no cache invalidate or write-back instruction is expected anywhere.
; NOTE(review): presumably the absence of cache maintenance is because
; wavefront scope does not require it - confirm against the AMDGPU memory model.
2247define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
2248; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2249; GFX7:       ; %bb.0: ; %entry
2250; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2251; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2252; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2253; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2254; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2255; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2256; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2257; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2258; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2259; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2260; GFX7-NEXT:    flat_store_dword v[0:1], v2
2261; GFX7-NEXT:    s_endpgm
2262;
2263; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2264; GFX10-WGP:       ; %bb.0: ; %entry
2265; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2266; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2267; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2268; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2269; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2270; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
2271; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2272; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2273; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2274; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2275; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2276; GFX10-WGP-NEXT:    s_endpgm
2277;
2278; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2279; GFX10-CU:       ; %bb.0: ; %entry
2280; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2281; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2282; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2283; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2284; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2285; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
2286; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2287; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2288; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2289; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2290; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2291; GFX10-CU-NEXT:    s_endpgm
2292;
2293; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2294; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2295; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2296; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
2297; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2298; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2299; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2300; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2301; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2302; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2303; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2304; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2305; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2306; SKIP-CACHE-INV-NEXT:    s_endpgm
2307;
2308; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2309; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2310; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2311; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2312; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2313; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2314; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2315; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2316; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2317; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2318; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2319; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2320;
2321; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2322; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2323; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2324; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2325; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2326; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2327; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
2328; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2329; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2330; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2331; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
2332; GFX90A-TGSPLIT-NEXT:    s_endpgm
2333;
2334; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2335; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2336; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2337; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2338; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2339; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2340; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2341; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2342; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2343; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2344; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2345; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2346;
2347; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2348; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2349; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2350; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2351; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2352; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2353; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
2354; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
2355; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2356; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2357; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
2358; GFX940-TGSPLIT-NEXT:    s_endpgm
2359;
2360; GFX11-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2361; GFX11-WGP:       ; %bb.0: ; %entry
2362; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2363; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2364; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2365; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2366; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2367; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
2368; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2369; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2370; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2371; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2372; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
2373; GFX11-WGP-NEXT:    s_endpgm
2374;
2375; GFX11-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2376; GFX11-CU:       ; %bb.0: ; %entry
2377; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2378; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2379; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2380; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2381; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2382; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
2383; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
2384; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2385; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2386; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2387; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
2388; GFX11-CU-NEXT:    s_endpgm
2389;
2390; GFX12-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2391; GFX12-WGP:       ; %bb.0: ; %entry
2392; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2393; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2394; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2395; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2396; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2397; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
2398; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2399; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2400; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2401; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
2402; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
2403; GFX12-WGP-NEXT:    s_endpgm
2404;
2405; GFX12-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
2406; GFX12-CU:       ; %bb.0: ; %entry
2407; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2408; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2409; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2410; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2411; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2412; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
2413; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2414; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2415; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2416; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
2417; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
2418; GFX12-CU-NEXT:    s_endpgm
2419    ptr %out, i32 %in) {
2420entry:
2421  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
2422  store i32 %val, ptr %out, align 4
2423  ret void
2424}
2425
; Compare-and-exchange through a flat pointer at syncscope("wavefront") with
; monotonic success and failure orderings. The kernel forms the address
; %out + 4 i32 elements (16 bytes) and performs cmpxchg with %old as the
; expected value and %in as the replacement; the cmpxchg result is unused.
;
; The assertions inside the signature are autogenerated (see the NOTE on the
; first line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected sequences show, per RUN configuration:
;  - the compare-swap is emitted without a returning modifier and nothing
;    but s_endpgm follows it (no waits, no cache maintenance);
;  - the GFX7, GFX10 and SKIP-CACHE-INV sequences add the 16-byte offset to
;    the pointer with s_add_u32/s_addc_u32, whereas the GFX90A, GFX940, GFX11
;    and GFX12 sequences carry it as an immediate "offset 16" operand on the
;    flat instruction;
;  - the data operand is the register pair v[2:3], with v2 loaded from the
;    kernarg at byte offset 8 and v3 from the kernarg at byte offset 12.
; NOTE(review): the kernarg-to-operand mapping above is read off the load
; offsets in the checks; which of the two is %in and which is %old is
; presumably kernarg order (%in first) - confirm against the calling convention.
2426define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
2427; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2428; GFX7:       ; %bb.0: ; %entry
2429; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2430; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2431; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2432; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2433; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2434; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2435; GFX7-NEXT:    s_mov_b32 s4, s8
2436; GFX7-NEXT:    s_mov_b32 s5, s9
2437; GFX7-NEXT:    s_mov_b32 s9, s10
2438; GFX7-NEXT:    s_mov_b32 s8, s11
2439; GFX7-NEXT:    s_add_u32 s4, s4, s9
2440; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2441; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2442; GFX7-NEXT:    s_mov_b32 s5, s8
2443; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2444; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2445; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2446; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2447; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2448; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2449; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2450; GFX7-NEXT:    s_endpgm
2451;
2452; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2453; GFX10-WGP:       ; %bb.0: ; %entry
2454; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
2455; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2456; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
2457; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
2458; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
2459; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2460; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
2461; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
2462; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
2463; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
2464; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
2465; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
2466; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2467; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
2468; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
2469; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2470; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2471; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
2472; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2473; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2474; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2475; GFX10-WGP-NEXT:    s_endpgm
2476;
2477; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2478; GFX10-CU:       ; %bb.0: ; %entry
2479; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
2480; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2481; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
2482; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
2483; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
2484; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2485; GFX10-CU-NEXT:    s_mov_b32 s4, s8
2486; GFX10-CU-NEXT:    s_mov_b32 s5, s9
2487; GFX10-CU-NEXT:    s_mov_b32 s9, s10
2488; GFX10-CU-NEXT:    s_mov_b32 s8, s11
2489; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
2490; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
2491; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2492; GFX10-CU-NEXT:    s_mov_b32 s5, s8
2493; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
2494; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2495; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2496; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
2497; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2498; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2499; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2500; GFX10-CU-NEXT:    s_endpgm
2501;
2502; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2503; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2504; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
2505; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
2506; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
2507; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
2508; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
2509; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2510; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2511; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2512; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
2513; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
2514; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
2515; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
2516; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
2517; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
2518; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2519; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2520; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2521; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
2522; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2523; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2524; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2525; SKIP-CACHE-INV-NEXT:    s_endpgm
2526;
2527; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2528; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2529; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2530; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2531; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2532; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2533; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2534; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2535; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2536; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2537; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2538; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2539; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2540;
2541; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2542; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2543; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2544; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2545; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2546; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2547; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2548; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2549; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2550; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2551; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2552; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2553; GFX90A-TGSPLIT-NEXT:    s_endpgm
2554;
2555; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2556; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2557; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2558; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2559; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2560; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2561; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2562; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2563; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2564; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2565; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2566; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2567; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2568;
2569; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2570; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2571; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2572; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2573; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2574; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2575; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2576; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2577; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2578; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2579; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2580; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2581; GFX940-TGSPLIT-NEXT:    s_endpgm
2582;
2583; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2584; GFX11-WGP:       ; %bb.0: ; %entry
2585; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2586; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2587; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2588; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2589; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
2590; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
2591; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2592; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
2593; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2594; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2595; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2596; GFX11-WGP-NEXT:    s_endpgm
2597;
2598; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2599; GFX11-CU:       ; %bb.0: ; %entry
2600; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2601; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2602; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2603; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2604; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
2605; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
2606; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2607; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
2608; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2609; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2610; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2611; GFX11-CU-NEXT:    s_endpgm
2612;
2613; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2614; GFX12-WGP:       ; %bb.0: ; %entry
2615; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2616; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2617; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2618; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2619; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
2620; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
2621; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2622; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
2623; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2624; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2625; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2626; GFX12-WGP-NEXT:    s_endpgm
2627;
2628; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
2629; GFX12-CU:       ; %bb.0: ; %entry
2630; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2631; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2632; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2633; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2634; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
2635; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
2636; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2637; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
2638; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2639; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2640; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2641; GFX12-CU-NEXT:    s_endpgm
2642    ptr %out, i32 %in, i32 %old) {
2643entry:
2644  %gep = getelementptr i32, ptr %out, i32 4
2645  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
2646  ret void
2647}
2648
2649define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
2650; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2651; GFX7:       ; %bb.0: ; %entry
2652; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2653; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2654; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2655; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2656; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2657; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX7-NEXT:    s_mov_b32 s4, s8
2659; GFX7-NEXT:    s_mov_b32 s5, s9
2660; GFX7-NEXT:    s_mov_b32 s9, s10
2661; GFX7-NEXT:    s_mov_b32 s8, s11
2662; GFX7-NEXT:    s_add_u32 s4, s4, s9
2663; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2664; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2665; GFX7-NEXT:    s_mov_b32 s5, s8
2666; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2667; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2668; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2669; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2670; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2671; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2672; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2673; GFX7-NEXT:    s_endpgm
2674;
2675; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2676; GFX10-WGP:       ; %bb.0: ; %entry
2677; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
2678; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2679; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
2680; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
2681; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
2682; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2683; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
2684; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
2685; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
2686; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
2687; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
2688; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
2689; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2690; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
2691; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
2692; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2693; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2694; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
2695; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2696; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2697; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2698; GFX10-WGP-NEXT:    s_endpgm
2699;
2700; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2701; GFX10-CU:       ; %bb.0: ; %entry
2702; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
2703; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2704; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
2705; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
2706; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
2707; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2708; GFX10-CU-NEXT:    s_mov_b32 s4, s8
2709; GFX10-CU-NEXT:    s_mov_b32 s5, s9
2710; GFX10-CU-NEXT:    s_mov_b32 s9, s10
2711; GFX10-CU-NEXT:    s_mov_b32 s8, s11
2712; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
2713; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
2714; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2715; GFX10-CU-NEXT:    s_mov_b32 s5, s8
2716; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
2717; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2718; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2719; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
2720; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2721; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2722; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2723; GFX10-CU-NEXT:    s_endpgm
2724;
2725; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2726; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2727; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
2728; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
2729; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
2730; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
2731; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
2732; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2733; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2734; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2735; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
2736; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
2737; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
2738; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
2739; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
2740; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
2741; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2742; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2743; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2744; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
2745; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2746; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2747; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2748; SKIP-CACHE-INV-NEXT:    s_endpgm
2749;
2750; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2751; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2752; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2753; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2754; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2755; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2756; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2757; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2758; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2759; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2760; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2761; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2762; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2763;
2764; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2765; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2766; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2767; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2768; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2769; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2770; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2771; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2772; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2773; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2774; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2775; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2776; GFX90A-TGSPLIT-NEXT:    s_endpgm
2777;
2778; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2779; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2780; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2781; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2782; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2783; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2784; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2785; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2786; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2787; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2788; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2789; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2790; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2791;
2792; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2793; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2794; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2795; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2796; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2797; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2798; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2799; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
2800; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2801; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2802; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2803; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2804; GFX940-TGSPLIT-NEXT:    s_endpgm
2805;
2806; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2807; GFX11-WGP:       ; %bb.0: ; %entry
2808; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2809; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2810; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2811; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2812; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
2813; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
2814; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2815; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
2816; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
2817; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
2818; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2819; GFX11-WGP-NEXT:    s_endpgm
2820;
2821; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2822; GFX11-CU:       ; %bb.0: ; %entry
2823; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2824; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2825; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2826; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
2828; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
2829; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2830; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
2831; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
2832; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
2833; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2834; GFX11-CU-NEXT:    s_endpgm
2835;
2836; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2837; GFX12-WGP:       ; %bb.0: ; %entry
2838; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2839; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2840; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2841; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2842; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
2843; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
2844; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2845; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
2846; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
2847; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
2848; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2849; GFX12-WGP-NEXT:    s_endpgm
2850;
2851; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
2852; GFX12-CU:       ; %bb.0: ; %entry
2853; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2854; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2855; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2856; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2857; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
2858; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
2859; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2860; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
2861; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
2862; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
2863; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2864; GFX12-CU-NEXT:    s_endpgm
2865    ptr %out, i32 %in, i32 %old) {
2866entry:
2867  %gep = getelementptr i32, ptr %out, i32 4
2868  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
2869  ret void
2870}
2871
; Checks code generation for a volatile i32 cmpxchg at syncscope("wavefront")
; with release success ordering and monotonic failure ordering. The access
; targets %out advanced by 4 i32 elements (16 bytes); the result is unused.
; In the expected output the 16-byte displacement is added with
; s_add_u32/s_addc_u32 for the gfx700 and gfx1010 runs and appears as
; offset:16 on the atomic for the gfx90a, gfx940, gfx1100 and gfx1200 runs.
; In every expected sequence the flat_atomic_cmpswap (flat_atomic_cmpswap_b32
; for gfx1100/gfx1200) is immediately followed by s_endpgm.
2872define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
2873; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
2874; GFX7:       ; %bb.0: ; %entry
2875; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2876; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2877; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2878; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2879; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2880; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2881; GFX7-NEXT:    s_mov_b32 s4, s8
2882; GFX7-NEXT:    s_mov_b32 s5, s9
2883; GFX7-NEXT:    s_mov_b32 s9, s10
2884; GFX7-NEXT:    s_mov_b32 s8, s11
2885; GFX7-NEXT:    s_add_u32 s4, s4, s9
2886; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2887; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2888; GFX7-NEXT:    s_mov_b32 s5, s8
2889; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2890; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2891; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2892; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2893; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2894; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2895; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2896; GFX7-NEXT:    s_endpgm
2897;
2898; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
2899; GFX10-WGP:       ; %bb.0: ; %entry
2900; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
2901; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2902; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
2903; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
2904; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
2905; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2906; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
2907; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
2908; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
2909; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
2910; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
2911; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
2912; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2913; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
2914; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
2915; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
2916; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2917; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
2918; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2919; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2920; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2921; GFX10-WGP-NEXT:    s_endpgm
2922;
2923; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
2924; GFX10-CU:       ; %bb.0: ; %entry
2925; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
2926; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2927; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
2928; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
2929; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
2930; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2931; GFX10-CU-NEXT:    s_mov_b32 s4, s8
2932; GFX10-CU-NEXT:    s_mov_b32 s5, s9
2933; GFX10-CU-NEXT:    s_mov_b32 s9, s10
2934; GFX10-CU-NEXT:    s_mov_b32 s8, s11
2935; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
2936; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
2937; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2938; GFX10-CU-NEXT:    s_mov_b32 s5, s8
2939; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
2940; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
2941; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2942; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
2943; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2944; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2945; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2946; GFX10-CU-NEXT:    s_endpgm
2947;
2948; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg:
2949; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2950; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
2951; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
2952; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
2953; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
2954; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
2955; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2956; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2957; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2958; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
2959; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
2960; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
2961; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
2962; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
2963; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
2964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
2965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2966; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2967; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
2968; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2969; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2970; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2971; SKIP-CACHE-INV-NEXT:    s_endpgm
2972;
2973; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
2974; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2975; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2976; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2977; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2978; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2979; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2980; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2981; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2982; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2983; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2984; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2985; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2986;
2987; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
2988; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2989; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2990; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2991; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2992; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2993; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2994; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
2995; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2996; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
2997; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2998; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2999; GFX90A-TGSPLIT-NEXT:    s_endpgm
3000;
3001; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
3002; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3003; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3004; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3005; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3006; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3007; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3008; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3009; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3010; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3011; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3012; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3013; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3014;
3015; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
3016; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3017; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3018; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3019; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3020; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3021; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3022; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3023; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3024; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3025; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3026; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3027; GFX940-TGSPLIT-NEXT:    s_endpgm
3028;
3029; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
3030; GFX11-WGP:       ; %bb.0: ; %entry
3031; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3032; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3033; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3034; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3035; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3036; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3037; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3038; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3039; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3040; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3041; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3042; GFX11-WGP-NEXT:    s_endpgm
3043;
3044; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
3045; GFX11-CU:       ; %bb.0: ; %entry
3046; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3047; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3048; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3049; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3050; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3051; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3052; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3053; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3054; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3055; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3056; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3057; GFX11-CU-NEXT:    s_endpgm
3058;
3059; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
3060; GFX12-WGP:       ; %bb.0: ; %entry
3061; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3062; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3063; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3064; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3065; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3066; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3067; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3068; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3069; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3070; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3071; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3072; GFX12-WGP-NEXT:    s_endpgm
3073;
3074; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
3075; GFX12-CU:       ; %bb.0: ; %entry
3076; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3077; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3078; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3079; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3080; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3081; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3082; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3083; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3084; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3085; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3086; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3087; GFX12-CU-NEXT:    s_endpgm
3088    ptr %out, i32 %in, i32 %old) {
3089entry:
  ; Index 4 of an i32 array based at %out, i.e. %out + 16 bytes.
3090  %gep = getelementptr i32, ptr %out, i32 4
  ; Expected value is %old and the replacement is %in; %val is never read.
3091  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
3092  ret void
3093}
3094
; Checks code generation for a volatile i32 cmpxchg at syncscope("wavefront")
; with acq_rel success ordering and monotonic failure ordering. The access
; targets %out advanced by 4 i32 elements (16 bytes); the result is unused.
; In the expected output the 16-byte displacement is added with
; s_add_u32/s_addc_u32 for the gfx700 and gfx1010 runs and appears as
; offset:16 on the atomic for the gfx90a, gfx940, gfx1100 and gfx1200 runs.
; In every expected sequence the flat_atomic_cmpswap (flat_atomic_cmpswap_b32
; for gfx1100/gfx1200) is immediately followed by s_endpgm.
3095define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
3096; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3097; GFX7:       ; %bb.0: ; %entry
3098; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3099; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3100; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3101; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3102; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3103; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3104; GFX7-NEXT:    s_mov_b32 s4, s8
3105; GFX7-NEXT:    s_mov_b32 s5, s9
3106; GFX7-NEXT:    s_mov_b32 s9, s10
3107; GFX7-NEXT:    s_mov_b32 s8, s11
3108; GFX7-NEXT:    s_add_u32 s4, s4, s9
3109; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3110; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3111; GFX7-NEXT:    s_mov_b32 s5, s8
3112; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3113; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3114; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3115; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3116; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3117; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3118; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3119; GFX7-NEXT:    s_endpgm
3120;
3121; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3122; GFX10-WGP:       ; %bb.0: ; %entry
3123; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3124; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3125; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3126; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3127; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3128; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3129; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3130; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3131; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3132; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3133; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3134; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3135; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3136; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3137; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3138; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3139; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3140; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3141; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3142; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3143; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3144; GFX10-WGP-NEXT:    s_endpgm
3145;
3146; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3147; GFX10-CU:       ; %bb.0: ; %entry
3148; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3149; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3150; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3151; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3152; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3153; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3155; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3156; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3157; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3158; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3159; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3160; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3161; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3162; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3163; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3164; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3165; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3166; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3167; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3168; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3169; GFX10-CU-NEXT:    s_endpgm
3170;
3171; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3172; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3173; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3174; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3175; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3176; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3177; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3178; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3179; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3180; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3181; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3182; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3183; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3184; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3185; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3186; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3187; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3189; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3190; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3191; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3192; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3193; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3194; SKIP-CACHE-INV-NEXT:    s_endpgm
3195;
3196; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3197; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3198; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3199; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3200; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3201; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3202; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3203; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3204; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3205; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3206; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3207; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3208; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3209;
3210; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3211; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3212; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3213; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3214; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3215; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3216; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3217; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3218; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3219; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3220; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3221; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3222; GFX90A-TGSPLIT-NEXT:    s_endpgm
3223;
3224; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3225; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3226; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3227; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3228; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3229; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3230; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3231; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3232; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3233; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3234; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3235; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3236; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3237;
3238; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3239; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3240; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3241; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3242; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3243; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3244; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3245; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3246; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3247; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3248; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3249; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3250; GFX940-TGSPLIT-NEXT:    s_endpgm
3251;
3252; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3253; GFX11-WGP:       ; %bb.0: ; %entry
3254; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3255; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3256; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3257; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3258; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3259; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3260; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3261; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3262; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3263; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3264; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3265; GFX11-WGP-NEXT:    s_endpgm
3266;
3267; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3268; GFX11-CU:       ; %bb.0: ; %entry
3269; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3270; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3271; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3272; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3273; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3274; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3275; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3276; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3277; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3278; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3279; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3280; GFX11-CU-NEXT:    s_endpgm
3281;
3282; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3283; GFX12-WGP:       ; %bb.0: ; %entry
3284; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3285; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3286; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3287; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3288; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3289; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3290; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3291; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3292; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3293; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3294; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3295; GFX12-WGP-NEXT:    s_endpgm
3296;
3297; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
3298; GFX12-CU:       ; %bb.0: ; %entry
3299; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3300; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3301; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3302; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3303; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3304; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3305; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3306; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3307; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3308; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3309; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3310; GFX12-CU-NEXT:    s_endpgm
3311    ptr %out, i32 %in, i32 %old) {
3312entry:
  ; Index 4 of an i32 array based at %out, i.e. %out + 16 bytes.
3313  %gep = getelementptr i32, ptr %out, i32 4
  ; Expected value is %old and the replacement is %in; %val is never read.
3314  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
3315  ret void
3316}
3317
; Wavefront-scope cmpxchg through a plain `ptr` argument, with success
; ordering seq_cst and failure ordering monotonic.
; For every check prefix the compare-swap instruction is expected to be
; followed directly by s_endpgm; the only wait instruction in the checks sits
; after the s_load instructions and before the atomic.
; The gfx700 and gfx1010 checks form the +16 byte address with scalar adds,
; while the gfx90a, gfx940, gfx1100 and gfx1200 checks use an `offset:16`
; immediate on the atomic itself.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
3318define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
3319; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3320; GFX7:       ; %bb.0: ; %entry
3321; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3322; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3323; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3324; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3325; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3326; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3327; GFX7-NEXT:    s_mov_b32 s4, s8
3328; GFX7-NEXT:    s_mov_b32 s5, s9
3329; GFX7-NEXT:    s_mov_b32 s9, s10
3330; GFX7-NEXT:    s_mov_b32 s8, s11
3331; GFX7-NEXT:    s_add_u32 s4, s4, s9
3332; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3333; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3334; GFX7-NEXT:    s_mov_b32 s5, s8
3335; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3336; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3337; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3338; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3339; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3340; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3341; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3342; GFX7-NEXT:    s_endpgm
3343;
3344; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3345; GFX10-WGP:       ; %bb.0: ; %entry
3346; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3347; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3348; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3349; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3350; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3351; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3352; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3353; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3354; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3355; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3356; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3357; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3358; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3359; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3360; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3361; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3362; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3363; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3364; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3365; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3366; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3367; GFX10-WGP-NEXT:    s_endpgm
3368;
3369; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3370; GFX10-CU:       ; %bb.0: ; %entry
3371; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3372; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3373; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3374; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3375; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3376; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3377; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3378; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3379; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3380; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3381; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3382; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3383; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3384; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3385; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3387; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3388; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3389; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3390; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3391; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3392; GFX10-CU-NEXT:    s_endpgm
3393;
3394; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3395; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3396; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3397; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3398; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3399; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3400; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3401; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3402; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3403; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3404; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3405; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3406; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3407; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3408; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3409; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3410; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3411; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3412; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3413; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3414; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3415; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3416; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3417; SKIP-CACHE-INV-NEXT:    s_endpgm
3418;
3419; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3420; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3421; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3422; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3423; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3424; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3425; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3426; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3427; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3428; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3429; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3430; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3431; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3432;
3433; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3434; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3435; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3436; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3437; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3438; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3439; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3440; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3441; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3442; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3443; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3444; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3445; GFX90A-TGSPLIT-NEXT:    s_endpgm
3446;
3447; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3448; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3449; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3450; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3451; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3452; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3453; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3454; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3455; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3456; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3457; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3458; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3459; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3460;
3461; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3462; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3463; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3464; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3465; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3466; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3467; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3468; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3469; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3470; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3471; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3472; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3473; GFX940-TGSPLIT-NEXT:    s_endpgm
3474;
3475; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3476; GFX11-WGP:       ; %bb.0: ; %entry
3477; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3478; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3479; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3480; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3481; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3482; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3483; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3484; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3485; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3486; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3487; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3488; GFX11-WGP-NEXT:    s_endpgm
3489;
3490; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3491; GFX11-CU:       ; %bb.0: ; %entry
3492; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3493; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3494; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3495; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3496; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3497; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3498; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3499; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3500; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3501; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3502; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3503; GFX11-CU-NEXT:    s_endpgm
3504;
3505; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3506; GFX12-WGP:       ; %bb.0: ; %entry
3507; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3508; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3509; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3510; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3511; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3512; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3513; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3514; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3515; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3516; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3517; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3518; GFX12-WGP-NEXT:    s_endpgm
3519;
3520; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
3521; GFX12-CU:       ; %bb.0: ; %entry
3522; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3523; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3524; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3525; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3526; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3527; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3528; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3529; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3530; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3531; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3532; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3533; GFX12-CU-NEXT:    s_endpgm
3534    ptr %out, i32 %in, i32 %old) {
3535entry:
  ; Element 4 of %out in i32 units, i.e. the 16-byte offset seen in the checks.
3536  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match; the result %val is unused.
3537  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
3538  ret void
3539}
3540
; Wavefront-scope cmpxchg through a plain `ptr` argument, with success
; ordering monotonic and failure ordering acquire.
; For every check prefix the compare-swap instruction is expected to be
; followed directly by s_endpgm; the only wait instruction in the checks sits
; after the s_load instructions and before the atomic.
; The gfx700 and gfx1010 checks form the +16 byte address with scalar adds,
; while the gfx90a, gfx940, gfx1100 and gfx1200 checks use an `offset:16`
; immediate on the atomic itself.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
3541define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
3542; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3543; GFX7:       ; %bb.0: ; %entry
3544; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3545; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3546; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3547; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3548; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3549; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3550; GFX7-NEXT:    s_mov_b32 s4, s8
3551; GFX7-NEXT:    s_mov_b32 s5, s9
3552; GFX7-NEXT:    s_mov_b32 s9, s10
3553; GFX7-NEXT:    s_mov_b32 s8, s11
3554; GFX7-NEXT:    s_add_u32 s4, s4, s9
3555; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3556; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3557; GFX7-NEXT:    s_mov_b32 s5, s8
3558; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3559; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3560; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3561; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3562; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3563; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3564; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3565; GFX7-NEXT:    s_endpgm
3566;
3567; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3568; GFX10-WGP:       ; %bb.0: ; %entry
3569; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3570; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3571; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3572; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3573; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3574; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3575; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3576; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3577; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3578; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3579; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3580; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3581; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3582; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3583; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3584; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3585; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3586; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3587; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3588; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3589; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3590; GFX10-WGP-NEXT:    s_endpgm
3591;
3592; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3593; GFX10-CU:       ; %bb.0: ; %entry
3594; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3595; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3596; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3597; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3598; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3599; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3600; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3601; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3602; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3603; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3604; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3605; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3606; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3607; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3608; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3609; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3610; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3611; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3612; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3613; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3614; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3615; GFX10-CU-NEXT:    s_endpgm
3616;
3617; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3618; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3619; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3620; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3621; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3622; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3623; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3624; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3625; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3626; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3627; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3628; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3629; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3630; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3631; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3632; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3633; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3634; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3635; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3636; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3637; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3638; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3639; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3640; SKIP-CACHE-INV-NEXT:    s_endpgm
3641;
3642; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3643; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3644; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3645; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3646; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3647; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3648; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3649; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3650; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3651; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3652; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3653; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3654; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3655;
3656; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3657; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3658; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3659; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3660; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3661; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3662; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3663; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3664; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3665; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3666; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3667; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3668; GFX90A-TGSPLIT-NEXT:    s_endpgm
3669;
3670; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3671; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3672; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3673; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3674; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3675; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3676; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3677; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3678; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3679; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3680; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3681; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3682; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3683;
3684; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3685; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3686; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3687; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3688; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3689; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3690; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3691; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3692; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3693; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3694; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3695; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3696; GFX940-TGSPLIT-NEXT:    s_endpgm
3697;
3698; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3699; GFX11-WGP:       ; %bb.0: ; %entry
3700; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3701; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3702; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3703; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3704; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3705; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3706; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3707; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3708; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3709; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3710; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3711; GFX11-WGP-NEXT:    s_endpgm
3712;
3713; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3714; GFX11-CU:       ; %bb.0: ; %entry
3715; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3716; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3717; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3718; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3719; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3720; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3721; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3722; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3723; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3724; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3725; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3726; GFX11-CU-NEXT:    s_endpgm
3727;
3728; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3729; GFX12-WGP:       ; %bb.0: ; %entry
3730; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3731; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3732; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3733; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3734; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3735; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3736; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3737; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3738; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3739; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3740; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3741; GFX12-WGP-NEXT:    s_endpgm
3742;
3743; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
3744; GFX12-CU:       ; %bb.0: ; %entry
3745; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3746; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3747; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3748; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3749; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3750; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3751; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3752; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3753; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3754; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3755; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3756; GFX12-CU-NEXT:    s_endpgm
3757    ptr %out, i32 %in, i32 %old) {
3758entry:
  ; Element 4 of %out in i32 units, i.e. the 16-byte offset seen in the checks.
3759  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match; the result %val is unused.
3760  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
3761  ret void
3762}
3763
; Wavefront-scope cmpxchg through a plain `ptr` argument, with success
; ordering acquire and failure ordering acquire.
; For every check prefix the compare-swap instruction is expected to be
; followed directly by s_endpgm; the only wait instruction in the checks sits
; after the s_load instructions and before the atomic.
; The gfx700 and gfx1010 checks form the +16 byte address with scalar adds,
; while the gfx90a, gfx940, gfx1100 and gfx1200 checks use an `offset:16`
; immediate on the atomic itself.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
3764define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
3765; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3766; GFX7:       ; %bb.0: ; %entry
3767; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3768; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3769; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3770; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3771; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3772; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3773; GFX7-NEXT:    s_mov_b32 s4, s8
3774; GFX7-NEXT:    s_mov_b32 s5, s9
3775; GFX7-NEXT:    s_mov_b32 s9, s10
3776; GFX7-NEXT:    s_mov_b32 s8, s11
3777; GFX7-NEXT:    s_add_u32 s4, s4, s9
3778; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3779; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3780; GFX7-NEXT:    s_mov_b32 s5, s8
3781; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3782; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3783; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3784; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3785; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3786; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3787; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3788; GFX7-NEXT:    s_endpgm
3789;
3790; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3791; GFX10-WGP:       ; %bb.0: ; %entry
3792; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
3793; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3794; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
3795; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
3796; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
3797; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3798; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
3799; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
3800; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
3801; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
3802; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
3803; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
3804; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3805; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
3806; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
3807; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
3808; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3809; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
3810; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
3811; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
3812; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3813; GFX10-WGP-NEXT:    s_endpgm
3814;
3815; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3816; GFX10-CU:       ; %bb.0: ; %entry
3817; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
3818; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3819; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
3820; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
3821; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
3822; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3823; GFX10-CU-NEXT:    s_mov_b32 s4, s8
3824; GFX10-CU-NEXT:    s_mov_b32 s5, s9
3825; GFX10-CU-NEXT:    s_mov_b32 s9, s10
3826; GFX10-CU-NEXT:    s_mov_b32 s8, s11
3827; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
3828; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
3829; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3830; GFX10-CU-NEXT:    s_mov_b32 s5, s8
3831; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
3832; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
3833; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3834; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
3835; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
3836; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
3837; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3838; GFX10-CU-NEXT:    s_endpgm
3839;
3840; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3841; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3842; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
3843; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
3844; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
3845; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
3846; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
3847; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3848; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3849; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3850; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
3851; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
3852; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
3853; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
3854; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3855; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
3856; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
3857; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3858; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3859; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
3860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3862; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3863; SKIP-CACHE-INV-NEXT:    s_endpgm
3864;
3865; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3866; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3867; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3868; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3869; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3870; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3871; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3872; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3873; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3874; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3875; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3876; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3877; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3878;
3879; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3880; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3881; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3882; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3883; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3884; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3885; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3886; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
3887; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3888; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3889; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3890; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3891; GFX90A-TGSPLIT-NEXT:    s_endpgm
3892;
3893; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3894; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3895; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3896; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3897; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3898; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3899; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3900; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3901; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3902; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3903; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3904; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3905; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3906;
3907; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3908; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3909; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3910; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3911; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3912; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3913; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3914; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
3915; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3916; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
3917; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
3918; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3919; GFX940-TGSPLIT-NEXT:    s_endpgm
3920;
3921; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3922; GFX11-WGP:       ; %bb.0: ; %entry
3923; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3924; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3925; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3926; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3927; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
3928; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
3929; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3930; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
3931; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
3932; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
3933; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3934; GFX11-WGP-NEXT:    s_endpgm
3935;
3936; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3937; GFX11-CU:       ; %bb.0: ; %entry
3938; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3939; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3940; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3941; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3942; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
3943; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
3944; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3945; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
3946; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
3947; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
3948; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3949; GFX11-CU-NEXT:    s_endpgm
3950;
3951; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3952; GFX12-WGP:       ; %bb.0: ; %entry
3953; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3954; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3955; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3956; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3957; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
3958; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
3959; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3960; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
3961; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
3962; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
3963; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3964; GFX12-WGP-NEXT:    s_endpgm
3965;
3966; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
3967; GFX12-CU:       ; %bb.0: ; %entry
3968; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3969; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3970; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3971; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3972; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
3973; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
3974; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3975; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
3976; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
3977; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
3978; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3979; GFX12-CU-NEXT:    s_endpgm
3980    ptr %out, i32 %in, i32 %old) {
3981entry:
  ; Element 4 of %out in i32 units, i.e. the 16-byte offset seen in the checks.
3982  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match; the result %val is unused.
3983  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
3984  ret void
3985}
3986
; Test: volatile cmpxchg at syncscope("wavefront") with release success
; ordering and acquire failure ordering, through an unqualified `ptr`.
;
; For every check prefix the expected code is: the s_load instructions for the
; three kernel arguments, one wait after them, operand setup, a single
; flat_atomic_cmpswap (spelled flat_atomic_cmpswap_b32 in the gfx11 and gfx12
; checks) and s_endpgm directly after it. No other wait or cache instruction is
; expected anywhere in the sequence.
; NOTE(review): presumably this is because wavefront scope needs no extra
; synchronization around the atomic - confirm against the AMDGPU memory model.
;
; The 16-byte displacement from the getelementptr shows up in two forms: the
; gfx7, gfx10 and skip-cache-inv checks add it to the pointer with
; s_add_u32/s_addc_u32, the remaining checks use an `offset:16` on the atomic.
3987define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
3988; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
3989; GFX7:       ; %bb.0: ; %entry
3990; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3991; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3992; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3993; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3994; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3995; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3996; GFX7-NEXT:    s_mov_b32 s4, s8
3997; GFX7-NEXT:    s_mov_b32 s5, s9
3998; GFX7-NEXT:    s_mov_b32 s9, s10
3999; GFX7-NEXT:    s_mov_b32 s8, s11
4000; GFX7-NEXT:    s_add_u32 s4, s4, s9
4001; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4002; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4003; GFX7-NEXT:    s_mov_b32 s5, s8
4004; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4005; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4006; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4007; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4008; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4009; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4010; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4011; GFX7-NEXT:    s_endpgm
4012;
4013; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
4014; GFX10-WGP:       ; %bb.0: ; %entry
4015; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4016; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4017; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4018; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4019; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4020; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4021; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4022; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4023; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4024; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4025; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4026; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4027; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4028; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4029; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4030; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4031; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4032; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4033; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4034; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4035; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4036; GFX10-WGP-NEXT:    s_endpgm
4037;
4038; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
4039; GFX10-CU:       ; %bb.0: ; %entry
4040; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4041; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4042; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4043; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4044; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4045; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4046; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4047; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4048; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4049; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4050; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4051; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4052; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4053; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4054; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4055; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4056; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4057; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4058; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4059; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4060; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4061; GFX10-CU-NEXT:    s_endpgm
4062;
4063; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg:
4064; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4065; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4066; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4067; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4068; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4069; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4070; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4071; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4072; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4073; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4074; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4075; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4076; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4077; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4078; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4079; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4080; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4081; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4082; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4083; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4084; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4085; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4086; SKIP-CACHE-INV-NEXT:    s_endpgm
4087;
4088; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
4089; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4090; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4091; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4092; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4093; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4094; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4095; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4096; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4097; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4098; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4099; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4100; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4101;
4102; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
4103; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4104; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4105; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4106; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4107; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4108; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4109; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4110; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4111; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4112; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4113; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4114; GFX90A-TGSPLIT-NEXT:    s_endpgm
4115;
4116; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
4117; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4118; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4119; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4120; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4121; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4122; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4123; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4124; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4125; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4126; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4127; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4128; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4129;
4130; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
4131; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4132; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4133; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4134; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4135; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4136; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4137; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4138; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4139; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4140; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4141; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4142; GFX940-TGSPLIT-NEXT:    s_endpgm
4143;
4144; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
4145; GFX11-WGP:       ; %bb.0: ; %entry
4146; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4147; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4148; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4149; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4150; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4151; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4152; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4153; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4154; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4155; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4156; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4157; GFX11-WGP-NEXT:    s_endpgm
4158;
4159; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
4160; GFX11-CU:       ; %bb.0: ; %entry
4161; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4162; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4163; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4164; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4165; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4166; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4167; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4168; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4169; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4170; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4171; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4172; GFX11-CU-NEXT:    s_endpgm
4173;
4174; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
4175; GFX12-WGP:       ; %bb.0: ; %entry
4176; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4177; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4178; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4179; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4180; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4181; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4182; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4183; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4184; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4185; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4186; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4187; GFX12-WGP-NEXT:    s_endpgm
4188;
4189; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
4190; GFX12-CU:       ; %bb.0: ; %entry
4191; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4192; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4193; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4194; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4195; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4196; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4197; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4198; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4199; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4200; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4201; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4202; GFX12-CU-NEXT:    s_endpgm
4203    ptr %out, i32 %in, i32 %old) {
4204entry:
; Address of i32 element 4, i.e. 16 bytes past %out.
4205  %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; the result %val is never used.
4206  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
4207  ret void
4208}
4209
; Test: volatile cmpxchg at syncscope("wavefront") with acq_rel success
; ordering and acquire failure ordering, through an unqualified `ptr`.
;
; For every check prefix the expected code is: the s_load instructions for the
; three kernel arguments, one wait after them, operand setup, a single
; flat_atomic_cmpswap (spelled flat_atomic_cmpswap_b32 in the gfx11 and gfx12
; checks) and s_endpgm directly after it. No other wait or cache instruction is
; expected anywhere in the sequence.
; NOTE(review): presumably this is because wavefront scope needs no extra
; synchronization around the atomic - confirm against the AMDGPU memory model.
;
; The 16-byte displacement from the getelementptr shows up in two forms: the
; gfx7, gfx10 and skip-cache-inv checks add it to the pointer with
; s_add_u32/s_addc_u32, the remaining checks use an `offset:16` on the atomic.
4210define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
4211; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4212; GFX7:       ; %bb.0: ; %entry
4213; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4214; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4215; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4216; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4217; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4218; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4219; GFX7-NEXT:    s_mov_b32 s4, s8
4220; GFX7-NEXT:    s_mov_b32 s5, s9
4221; GFX7-NEXT:    s_mov_b32 s9, s10
4222; GFX7-NEXT:    s_mov_b32 s8, s11
4223; GFX7-NEXT:    s_add_u32 s4, s4, s9
4224; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4225; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4226; GFX7-NEXT:    s_mov_b32 s5, s8
4227; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4228; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4229; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4230; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4231; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4232; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4233; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4234; GFX7-NEXT:    s_endpgm
4235;
4236; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4237; GFX10-WGP:       ; %bb.0: ; %entry
4238; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4239; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4240; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4241; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4242; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4243; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4244; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4245; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4246; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4247; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4248; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4249; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4250; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4251; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4252; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4253; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4254; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4255; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4256; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4257; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4258; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4259; GFX10-WGP-NEXT:    s_endpgm
4260;
4261; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4262; GFX10-CU:       ; %bb.0: ; %entry
4263; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4264; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4265; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4266; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4267; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4268; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4269; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4270; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4271; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4272; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4273; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4274; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4275; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4276; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4277; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4278; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4279; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4280; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4281; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4282; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4283; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4284; GFX10-CU-NEXT:    s_endpgm
4285;
4286; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4287; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4288; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4289; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4290; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4291; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4292; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4293; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4294; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4295; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4296; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4297; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4298; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4299; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4300; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4301; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4302; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4303; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4304; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4305; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4306; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4308; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4309; SKIP-CACHE-INV-NEXT:    s_endpgm
4310;
4311; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4312; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4313; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4314; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4315; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4316; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4317; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4318; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4319; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4320; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4321; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4322; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4323; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4324;
4325; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4326; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4327; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4328; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4329; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4330; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4331; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4332; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4333; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4334; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4335; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4336; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4337; GFX90A-TGSPLIT-NEXT:    s_endpgm
4338;
4339; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4340; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4341; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4342; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4343; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4344; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4345; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4346; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4347; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4348; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4349; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4350; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4351; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4352;
4353; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4354; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4355; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4356; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4357; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4358; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4359; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4360; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4361; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4362; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4363; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4364; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4365; GFX940-TGSPLIT-NEXT:    s_endpgm
4366;
4367; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4368; GFX11-WGP:       ; %bb.0: ; %entry
4369; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4370; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4371; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4372; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4373; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4374; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4375; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4376; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4377; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4378; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4379; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4380; GFX11-WGP-NEXT:    s_endpgm
4381;
4382; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4383; GFX11-CU:       ; %bb.0: ; %entry
4384; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4385; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4386; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4387; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4388; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4389; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4390; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4391; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4392; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4393; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4394; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4395; GFX11-CU-NEXT:    s_endpgm
4396;
4397; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4398; GFX12-WGP:       ; %bb.0: ; %entry
4399; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4400; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4401; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4402; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4403; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4404; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4405; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4406; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4407; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4408; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4409; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4410; GFX12-WGP-NEXT:    s_endpgm
4411;
4412; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
4413; GFX12-CU:       ; %bb.0: ; %entry
4414; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4415; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4416; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4417; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4418; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4419; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4420; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4421; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4422; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4423; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4424; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4425; GFX12-CU-NEXT:    s_endpgm
4426    ptr %out, i32 %in, i32 %old) {
4427entry:
; Address of i32 element 4, i.e. 16 bytes past %out.
4428  %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; the result %val is never used.
4429  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
4430  ret void
4431}
4432
4433define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
4434; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4435; GFX7:       ; %bb.0: ; %entry
4436; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4437; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4438; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4439; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4440; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4441; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4442; GFX7-NEXT:    s_mov_b32 s4, s8
4443; GFX7-NEXT:    s_mov_b32 s5, s9
4444; GFX7-NEXT:    s_mov_b32 s9, s10
4445; GFX7-NEXT:    s_mov_b32 s8, s11
4446; GFX7-NEXT:    s_add_u32 s4, s4, s9
4447; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4448; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4449; GFX7-NEXT:    s_mov_b32 s5, s8
4450; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4451; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4452; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4453; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4454; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4455; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4456; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4457; GFX7-NEXT:    s_endpgm
4458;
4459; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4460; GFX10-WGP:       ; %bb.0: ; %entry
4461; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4462; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4463; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4464; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4465; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4466; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4467; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4468; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4469; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4470; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4471; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4472; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4473; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4474; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4475; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4476; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4477; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4478; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4479; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4480; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4481; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4482; GFX10-WGP-NEXT:    s_endpgm
4483;
4484; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4485; GFX10-CU:       ; %bb.0: ; %entry
4486; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4487; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4488; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4489; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4490; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4491; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4492; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4493; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4494; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4495; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4496; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4497; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4498; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4499; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4500; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4501; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4502; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4503; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4504; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4505; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4506; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4507; GFX10-CU-NEXT:    s_endpgm
4508;
4509; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4510; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4511; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4512; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4513; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4514; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4515; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4516; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4517; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4518; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4519; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4520; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4521; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4522; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4523; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4524; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4525; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4526; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4527; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4529; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4530; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4531; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4532; SKIP-CACHE-INV-NEXT:    s_endpgm
4533;
4534; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4535; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4536; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4537; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4538; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4539; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4540; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4541; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4542; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4543; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4544; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4545; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4546; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4547;
4548; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4549; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4550; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4551; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4552; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4553; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4554; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4555; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4556; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4557; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4558; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4559; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4560; GFX90A-TGSPLIT-NEXT:    s_endpgm
4561;
4562; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4563; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4564; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4565; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4566; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4567; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4568; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4569; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4570; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4571; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4572; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4573; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4574; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4575;
4576; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4577; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4578; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4579; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4580; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4581; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4582; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4583; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4584; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4585; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4586; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4587; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4588; GFX940-TGSPLIT-NEXT:    s_endpgm
4589;
4590; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4591; GFX11-WGP:       ; %bb.0: ; %entry
4592; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4593; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4594; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4595; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4596; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4597; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4598; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4599; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4600; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4601; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4602; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4603; GFX11-WGP-NEXT:    s_endpgm
4604;
4605; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4606; GFX11-CU:       ; %bb.0: ; %entry
4607; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4608; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4609; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4610; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4611; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4612; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4613; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4614; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4615; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4616; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4617; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4618; GFX11-CU-NEXT:    s_endpgm
4619;
4620; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4621; GFX12-WGP:       ; %bb.0: ; %entry
4622; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4623; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4624; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4625; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4626; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4627; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4628; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4629; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4630; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4631; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4632; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4633; GFX12-WGP-NEXT:    s_endpgm
4634;
4635; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
4636; GFX12-CU:       ; %bb.0: ; %entry
4637; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4638; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4639; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4640; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4641; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4642; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4643; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4644; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4645; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4646; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4647; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4648; GFX12-CU-NEXT:    s_endpgm
4649    ptr %out, i32 %in, i32 %old) {
4650entry:
4651  %gep = getelementptr i32, ptr %out, i32 4
4652  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
4653  ret void
4654}
4655
; Kernel under test: a volatile 32-bit cmpxchg through `ptr %out` at
; syncscope("wavefront") with success ordering monotonic and failure ordering
; seq_cst. The assertions interleaved below are autogenerated (see the first
; line of this file); regenerate them with the update script rather than
; editing them by hand.
; In every check group the expected code is: kernel-argument loads, one wait
; for those loads, register moves, a single flat cmpswap with no returned
; value, then s_endpgm.
4656define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
4657; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4658; GFX7:       ; %bb.0: ; %entry
4659; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4660; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4661; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4662; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4663; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4664; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4665; GFX7-NEXT:    s_mov_b32 s4, s8
4666; GFX7-NEXT:    s_mov_b32 s5, s9
4667; GFX7-NEXT:    s_mov_b32 s9, s10
4668; GFX7-NEXT:    s_mov_b32 s8, s11
4669; GFX7-NEXT:    s_add_u32 s4, s4, s9
4670; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4671; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4672; GFX7-NEXT:    s_mov_b32 s5, s8
4673; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4674; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4675; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4676; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4677; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4678; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4679; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4680; GFX7-NEXT:    s_endpgm
4681;
4682; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4683; GFX10-WGP:       ; %bb.0: ; %entry
4684; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4685; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4686; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4687; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4688; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4689; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4690; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4691; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4692; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4693; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4694; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4695; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4696; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4697; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4698; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4699; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4700; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4701; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4702; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4703; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4704; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4705; GFX10-WGP-NEXT:    s_endpgm
4706;
4707; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4708; GFX10-CU:       ; %bb.0: ; %entry
4709; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4710; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4711; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4712; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4713; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4714; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4715; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4716; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4717; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4718; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4719; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4720; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4721; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4722; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4723; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4724; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4725; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4726; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4727; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4728; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4729; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4730; GFX10-CU-NEXT:    s_endpgm
4731;
4732; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4733; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4734; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4735; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4736; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4737; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4738; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4739; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4740; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4741; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4742; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4743; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4744; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4745; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4746; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4747; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4748; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4749; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4750; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4751; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4752; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4754; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4755; SKIP-CACHE-INV-NEXT:    s_endpgm
4756;
4757; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4758; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4759; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4760; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4761; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4762; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4763; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4764; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4765; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4766; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4767; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4768; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4769; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4770;
4771; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4772; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4773; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4774; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4775; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4776; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4777; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4778; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4779; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4780; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4781; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4782; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4783; GFX90A-TGSPLIT-NEXT:    s_endpgm
4784;
4785; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4786; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4787; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4788; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4789; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4790; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4791; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4792; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4793; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4794; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4795; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4796; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4797; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4798;
4799; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4800; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4801; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4802; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4803; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4804; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4805; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4806; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
4807; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4808; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4809; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
4810; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4811; GFX940-TGSPLIT-NEXT:    s_endpgm
4812;
4813; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4814; GFX11-WGP:       ; %bb.0: ; %entry
4815; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4816; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4817; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4818; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4819; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
4820; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
4821; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4822; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
4823; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
4824; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
4825; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4826; GFX11-WGP-NEXT:    s_endpgm
4827;
4828; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4829; GFX11-CU:       ; %bb.0: ; %entry
4830; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4831; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4832; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4833; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4834; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
4835; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
4836; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4837; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
4838; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
4839; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
4840; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4841; GFX11-CU-NEXT:    s_endpgm
4842;
4843; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4844; GFX12-WGP:       ; %bb.0: ; %entry
4845; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4846; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4847; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4848; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4849; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
4850; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
4851; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4852; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
4853; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
4854; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
4855; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4856; GFX12-WGP-NEXT:    s_endpgm
4857;
4858; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
4859; GFX12-CU:       ; %bb.0: ; %entry
4860; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4861; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4862; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4863; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4864; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
4865; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
4866; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4867; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
4868; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
4869; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
4870; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4871; GFX12-CU-NEXT:    s_endpgm
4872    ptr %out, i32 %in, i32 %old) {
4873entry:
  ; Element index 4 of i32 is byte offset 16; the expected output shows it
  ; either as an `offset:16` operand or as an s_add_u32/s_addc_u32 of 16.
4874  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare value and %in the replacement; %val is never used.
4875  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
4876  ret void
4877}
4878
; Kernel under test: a volatile 32-bit cmpxchg through `ptr %out` at
; syncscope("wavefront") with success ordering acquire and failure ordering
; seq_cst. The assertions interleaved below are autogenerated (see the first
; line of this file); regenerate them with the update script rather than
; editing them by hand.
; In every check group the expected code is: kernel-argument loads, one wait
; for those loads, register moves, a single flat cmpswap with no returned
; value, then s_endpgm.
4879define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
4880; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
4881; GFX7:       ; %bb.0: ; %entry
4882; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4883; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4884; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4885; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4886; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4887; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4888; GFX7-NEXT:    s_mov_b32 s4, s8
4889; GFX7-NEXT:    s_mov_b32 s5, s9
4890; GFX7-NEXT:    s_mov_b32 s9, s10
4891; GFX7-NEXT:    s_mov_b32 s8, s11
4892; GFX7-NEXT:    s_add_u32 s4, s4, s9
4893; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4894; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4895; GFX7-NEXT:    s_mov_b32 s5, s8
4896; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4897; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4898; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4899; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4900; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4901; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4902; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4903; GFX7-NEXT:    s_endpgm
4904;
4905; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
4906; GFX10-WGP:       ; %bb.0: ; %entry
4907; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
4908; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4909; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
4910; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
4911; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
4912; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4913; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
4914; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
4915; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
4916; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
4917; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
4918; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
4919; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4920; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
4921; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
4922; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
4923; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4924; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
4925; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4926; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4927; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4928; GFX10-WGP-NEXT:    s_endpgm
4929;
4930; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
4931; GFX10-CU:       ; %bb.0: ; %entry
4932; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
4933; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4934; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
4935; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
4936; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
4937; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4938; GFX10-CU-NEXT:    s_mov_b32 s4, s8
4939; GFX10-CU-NEXT:    s_mov_b32 s5, s9
4940; GFX10-CU-NEXT:    s_mov_b32 s9, s10
4941; GFX10-CU-NEXT:    s_mov_b32 s8, s11
4942; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
4943; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
4944; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4945; GFX10-CU-NEXT:    s_mov_b32 s5, s8
4946; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
4947; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
4948; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4949; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
4950; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4951; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4952; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4953; GFX10-CU-NEXT:    s_endpgm
4954;
4955; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
4956; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4957; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
4958; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
4959; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
4960; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
4961; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
4962; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4963; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
4964; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
4965; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
4966; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
4967; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
4968; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
4969; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4970; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
4971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
4972; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4973; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4974; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
4975; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4976; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4977; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4978; SKIP-CACHE-INV-NEXT:    s_endpgm
4979;
4980; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
4981; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4982; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4983; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4984; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4985; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4986; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4987; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
4988; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4989; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
4990; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4991; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4992; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4993;
4994; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
4995; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4996; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4997; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4998; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4999; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5000; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5001; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5002; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5003; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5004; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5005; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5006; GFX90A-TGSPLIT-NEXT:    s_endpgm
5007;
5008; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
5009; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5010; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5011; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5012; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5013; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5014; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5015; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5016; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5017; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5018; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5019; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5020; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5021;
5022; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
5023; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5024; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5025; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5026; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5027; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5028; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5029; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5030; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5031; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5032; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5033; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5034; GFX940-TGSPLIT-NEXT:    s_endpgm
5035;
5036; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
5037; GFX11-WGP:       ; %bb.0: ; %entry
5038; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5039; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5040; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5041; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5042; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5043; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5044; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5045; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5046; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5047; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5048; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5049; GFX11-WGP-NEXT:    s_endpgm
5050;
5051; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
5052; GFX11-CU:       ; %bb.0: ; %entry
5053; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5054; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5055; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5056; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5057; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5058; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5059; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5060; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5061; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5062; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5063; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5064; GFX11-CU-NEXT:    s_endpgm
5065;
5066; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
5067; GFX12-WGP:       ; %bb.0: ; %entry
5068; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5069; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5070; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5071; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5072; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5073; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5074; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5075; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5076; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5077; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5078; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5079; GFX12-WGP-NEXT:    s_endpgm
5080;
5081; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
5082; GFX12-CU:       ; %bb.0: ; %entry
5083; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5084; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5085; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5086; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5087; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5088; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5089; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5090; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5091; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5092; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5093; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5094; GFX12-CU-NEXT:    s_endpgm
5095    ptr %out, i32 %in, i32 %old) {
5096entry:
  ; Element index 4 of i32 is byte offset 16; the expected output shows it
  ; either as an `offset:16` operand or as an s_add_u32/s_addc_u32 of 16.
5097  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the compare value and %in the replacement; %val is never used.
5098  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
5099  ret void
5100}
5101
5102define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
5103; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5104; GFX7:       ; %bb.0: ; %entry
5105; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5106; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5107; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5108; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5109; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5110; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5111; GFX7-NEXT:    s_mov_b32 s4, s8
5112; GFX7-NEXT:    s_mov_b32 s5, s9
5113; GFX7-NEXT:    s_mov_b32 s9, s10
5114; GFX7-NEXT:    s_mov_b32 s8, s11
5115; GFX7-NEXT:    s_add_u32 s4, s4, s9
5116; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5117; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5118; GFX7-NEXT:    s_mov_b32 s5, s8
5119; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5120; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5121; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5122; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5123; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5124; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5125; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5126; GFX7-NEXT:    s_endpgm
5127;
5128; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5129; GFX10-WGP:       ; %bb.0: ; %entry
5130; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5131; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5132; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5133; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5134; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5135; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5136; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5137; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5138; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5139; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5140; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5141; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5142; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5143; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5144; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5145; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5146; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5147; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5148; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5149; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5150; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5151; GFX10-WGP-NEXT:    s_endpgm
5152;
5153; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5154; GFX10-CU:       ; %bb.0: ; %entry
5155; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5156; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5157; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5158; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5159; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5160; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5161; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5162; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5163; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5164; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5165; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5166; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5167; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5168; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5169; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5170; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5171; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5172; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5173; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5174; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5175; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5176; GFX10-CU-NEXT:    s_endpgm
5177;
5178; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5179; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5180; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5181; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5182; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5183; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5184; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5185; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5186; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5187; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5188; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5189; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5190; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5191; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5192; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5193; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5194; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5195; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5196; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5197; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5198; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5199; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5200; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5201; SKIP-CACHE-INV-NEXT:    s_endpgm
5202;
5203; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5204; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5205; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5206; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5207; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5208; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5209; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5210; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5211; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5212; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5213; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5214; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5215; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5216;
5217; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5218; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5219; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5220; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5221; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5222; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5223; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5224; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5225; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5226; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5227; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5228; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5229; GFX90A-TGSPLIT-NEXT:    s_endpgm
5230;
5231; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5232; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5233; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5234; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5235; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5236; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5237; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5238; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5239; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5240; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5241; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5242; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5243; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5244;
5245; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5246; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5247; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5248; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5249; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5250; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5251; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5252; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5253; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5254; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5255; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5256; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5257; GFX940-TGSPLIT-NEXT:    s_endpgm
5258;
5259; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5260; GFX11-WGP:       ; %bb.0: ; %entry
5261; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5262; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5263; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5264; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5265; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5266; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5267; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5268; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5269; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5270; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5271; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5272; GFX11-WGP-NEXT:    s_endpgm
5273;
5274; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5275; GFX11-CU:       ; %bb.0: ; %entry
5276; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5277; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5278; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5279; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5280; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5281; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5282; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5283; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5284; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5285; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5286; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5287; GFX11-CU-NEXT:    s_endpgm
5288;
5289; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5290; GFX12-WGP:       ; %bb.0: ; %entry
5291; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5292; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5293; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5294; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5295; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5296; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5297; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5298; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5299; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5300; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5301; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5302; GFX12-WGP-NEXT:    s_endpgm
5303;
5304; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
5305; GFX12-CU:       ; %bb.0: ; %entry
5306; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5307; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5308; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5309; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5310; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5311; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5312; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5313; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5314; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5315; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5316; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5317; GFX12-CU-NEXT:    s_endpgm
5318    ptr %out, i32 %in, i32 %old) {
5319entry:
5320  %gep = getelementptr i32, ptr %out, i32 4
5321  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
5322  ret void
5323}
5324
; Test kernel: a volatile i32 cmpxchg through a default-address-space pointer
; at syncscope("wavefront"), with acq_rel success ordering and seq_cst failure
; ordering. The cmpxchg result (%val) is never used.
;
; What the autogenerated checks below pin down:
;  * every check block expects a single flat_atomic_cmpswap (spelled
;    flat_atomic_cmpswap_b32 in the gfx1100 and gfx1200 blocks) that is
;    followed directly by s_endpgm;
;  * the gfx700 and gfx1010 blocks add the constant 16 to the pointer with
;    s_add_u32 / s_addc_u32, while the gfx90a, gfx940, gfx1100 and gfx1200
;    blocks carry it as an immediate offset of 16 on the atomic itself.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
5325define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
5326; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5327; GFX7:       ; %bb.0: ; %entry
5328; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5329; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5330; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5331; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5332; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5333; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5334; GFX7-NEXT:    s_mov_b32 s4, s8
5335; GFX7-NEXT:    s_mov_b32 s5, s9
5336; GFX7-NEXT:    s_mov_b32 s9, s10
5337; GFX7-NEXT:    s_mov_b32 s8, s11
5338; GFX7-NEXT:    s_add_u32 s4, s4, s9
5339; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5340; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5341; GFX7-NEXT:    s_mov_b32 s5, s8
5342; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5343; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5344; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5345; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5346; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5347; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5348; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5349; GFX7-NEXT:    s_endpgm
5350;
5351; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5352; GFX10-WGP:       ; %bb.0: ; %entry
5353; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5354; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5355; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5356; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5357; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5358; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5359; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5360; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5361; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5362; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5363; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5364; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5365; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5366; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5367; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5368; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5369; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5370; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5371; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5372; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5373; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5374; GFX10-WGP-NEXT:    s_endpgm
5375;
5376; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5377; GFX10-CU:       ; %bb.0: ; %entry
5378; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5379; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5380; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5381; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5382; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5383; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5384; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5385; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5386; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5387; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5388; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5389; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5390; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5391; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5392; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5393; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5394; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5395; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5396; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5397; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5398; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5399; GFX10-CU-NEXT:    s_endpgm
5400;
5401; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5402; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5403; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5404; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5405; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5406; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5407; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5408; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5409; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5410; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5411; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5412; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5413; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5414; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5415; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5416; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5418; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5419; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5420; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5421; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5422; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5423; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5424; SKIP-CACHE-INV-NEXT:    s_endpgm
5425;
5426; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5427; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5428; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5429; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5430; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5431; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5432; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5433; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5434; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5435; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5436; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5437; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5438; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5439;
5440; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5441; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5442; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5443; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5444; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5445; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5446; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5447; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5448; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5449; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5450; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5451; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5452; GFX90A-TGSPLIT-NEXT:    s_endpgm
5453;
5454; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5455; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5456; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5457; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5458; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5459; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5460; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5461; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5462; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5463; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5464; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5465; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5466; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5467;
5468; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5469; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5470; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5471; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5472; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5473; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5474; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5475; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5476; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5477; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5478; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5479; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5480; GFX940-TGSPLIT-NEXT:    s_endpgm
5481;
5482; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5483; GFX11-WGP:       ; %bb.0: ; %entry
5484; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5485; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5486; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5487; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5488; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5489; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5490; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5491; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5492; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5493; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5494; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5495; GFX11-WGP-NEXT:    s_endpgm
5496;
5497; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5498; GFX11-CU:       ; %bb.0: ; %entry
5499; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5500; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5501; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5502; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5503; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5504; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5505; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5506; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5507; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5508; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5509; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5510; GFX11-CU-NEXT:    s_endpgm
5511;
5512; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5513; GFX12-WGP:       ; %bb.0: ; %entry
5514; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5515; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5516; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5517; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5518; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5519; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5520; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5521; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5522; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5523; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5524; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5525; GFX12-WGP-NEXT:    s_endpgm
5526;
5527; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
5528; GFX12-CU:       ; %bb.0: ; %entry
5529; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5530; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5531; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5532; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5533; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5534; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5535; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5536; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5537; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5538; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5539; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5540; GFX12-CU-NEXT:    s_endpgm
5541    ptr %out, i32 %in, i32 %old) {
5542entry:
  ; Index 4 of an i32 array: the target address is %out + 16 bytes.
5543  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on success; the returned pair is dropped.
5544  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
5545  ret void
5546}
5547
; Test kernel: a volatile i32 cmpxchg through a default-address-space pointer
; at syncscope("wavefront"), with seq_cst ordering for both the success and
; the failure case. The cmpxchg result (%val) is never used.
;
; What the autogenerated checks below pin down:
;  * every check block expects a single flat_atomic_cmpswap (spelled
;    flat_atomic_cmpswap_b32 in the gfx1100 and gfx1200 blocks) that is
;    followed directly by s_endpgm;
;  * the gfx700 and gfx1010 blocks add the constant 16 to the pointer with
;    s_add_u32 / s_addc_u32, while the gfx90a, gfx940, gfx1100 and gfx1200
;    blocks carry it as an immediate offset of 16 on the atomic itself.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
5548define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
5549; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5550; GFX7:       ; %bb.0: ; %entry
5551; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5552; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5553; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5554; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5555; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5556; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5557; GFX7-NEXT:    s_mov_b32 s4, s8
5558; GFX7-NEXT:    s_mov_b32 s5, s9
5559; GFX7-NEXT:    s_mov_b32 s9, s10
5560; GFX7-NEXT:    s_mov_b32 s8, s11
5561; GFX7-NEXT:    s_add_u32 s4, s4, s9
5562; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5563; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5564; GFX7-NEXT:    s_mov_b32 s5, s8
5565; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5566; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5567; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5568; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5569; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5570; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5571; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5572; GFX7-NEXT:    s_endpgm
5573;
5574; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5575; GFX10-WGP:       ; %bb.0: ; %entry
5576; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
5577; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5578; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
5579; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
5580; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
5581; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5582; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
5583; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
5584; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
5585; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
5586; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
5587; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
5588; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5589; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
5590; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
5591; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5592; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5593; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5594; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5595; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5596; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5597; GFX10-WGP-NEXT:    s_endpgm
5598;
5599; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5600; GFX10-CU:       ; %bb.0: ; %entry
5601; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
5602; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5603; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
5604; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
5605; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
5606; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5607; GFX10-CU-NEXT:    s_mov_b32 s4, s8
5608; GFX10-CU-NEXT:    s_mov_b32 s5, s9
5609; GFX10-CU-NEXT:    s_mov_b32 s9, s10
5610; GFX10-CU-NEXT:    s_mov_b32 s8, s11
5611; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
5612; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
5613; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5614; GFX10-CU-NEXT:    s_mov_b32 s5, s8
5615; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
5616; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5617; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5618; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5619; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5620; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5621; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5622; GFX10-CU-NEXT:    s_endpgm
5623;
5624; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5625; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5626; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
5627; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5628; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
5629; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
5630; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
5631; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5632; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
5633; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
5634; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
5635; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
5636; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
5637; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
5638; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5639; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
5640; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
5641; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5642; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5643; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5646; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5647; SKIP-CACHE-INV-NEXT:    s_endpgm
5648;
5649; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5650; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5651; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5652; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5653; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5654; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5655; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5656; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5657; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5658; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5659; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5660; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5661; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5662;
5663; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5664; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5665; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5666; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5667; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5668; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5669; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5670; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5671; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5672; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5673; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5674; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5675; GFX90A-TGSPLIT-NEXT:    s_endpgm
5676;
5677; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5678; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5679; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5680; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5681; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5682; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5683; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5684; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5685; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5686; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5687; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5688; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5689; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5690;
5691; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5692; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5693; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5694; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5695; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5696; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5697; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5698; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5699; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5700; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5701; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5702; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5703; GFX940-TGSPLIT-NEXT:    s_endpgm
5704;
5705; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5706; GFX11-WGP:       ; %bb.0: ; %entry
5707; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5708; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5709; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5710; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5711; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5712; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5713; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5714; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5715; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5716; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5717; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5718; GFX11-WGP-NEXT:    s_endpgm
5719;
5720; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5721; GFX11-CU:       ; %bb.0: ; %entry
5722; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5723; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5724; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5725; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5726; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5727; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5728; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5729; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5730; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5731; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5732; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5733; GFX11-CU-NEXT:    s_endpgm
5734;
5735; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5736; GFX12-WGP:       ; %bb.0: ; %entry
5737; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5738; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5739; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5740; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5741; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
5742; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
5743; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5744; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
5745; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
5746; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
5747; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5748; GFX12-WGP-NEXT:    s_endpgm
5749;
5750; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
5751; GFX12-CU:       ; %bb.0: ; %entry
5752; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5753; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5754; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5755; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5756; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
5757; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
5758; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5759; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
5760; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
5761; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
5762; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5763; GFX12-CU-NEXT:    s_endpgm
5764    ptr %out, i32 %in, i32 %old) {
5765entry:
  ; Index 4 of an i32 array: the target address is %out + 16 bytes.
5766  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on success; the returned pair is dropped.
5767  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
5768  ret void
5769}
5770
5771define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
5772; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5773; GFX7:       ; %bb.0: ; %entry
5774; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
5775; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5776; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
5777; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
5778; GFX7-NEXT:    s_mov_b64 s[12:13], 16
5779; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5780; GFX7-NEXT:    s_mov_b32 s6, s4
5781; GFX7-NEXT:    s_mov_b32 s7, s5
5782; GFX7-NEXT:    s_mov_b32 s11, s12
5783; GFX7-NEXT:    s_mov_b32 s10, s13
5784; GFX7-NEXT:    s_add_u32 s6, s6, s11
5785; GFX7-NEXT:    s_addc_u32 s10, s7, s10
5786; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5787; GFX7-NEXT:    s_mov_b32 s7, s10
5788; GFX7-NEXT:    v_mov_b32_e32 v2, s9
5789; GFX7-NEXT:    v_mov_b32_e32 v0, s8
5790; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5791; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5792; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5793; GFX7-NEXT:    v_mov_b32_e32 v1, s7
5794; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5795; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5796; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5797; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5798; GFX7-NEXT:    flat_store_dword v[0:1], v2
5799; GFX7-NEXT:    s_endpgm
5800;
5801; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5802; GFX10-WGP:       ; %bb.0: ; %entry
5803; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
5804; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5805; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
5806; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
5807; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
5808; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5809; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
5810; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
5811; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
5812; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
5813; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
5814; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
5815; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5816; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
5817; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
5818; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
5819; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5820; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
5821; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
5822; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5823; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5824; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5825; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5826; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5827; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5828; GFX10-WGP-NEXT:    s_endpgm
5829;
5830; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5831; GFX10-CU:       ; %bb.0: ; %entry
5832; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
5833; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5834; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
5835; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
5836; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
5837; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5838; GFX10-CU-NEXT:    s_mov_b32 s6, s4
5839; GFX10-CU-NEXT:    s_mov_b32 s7, s5
5840; GFX10-CU-NEXT:    s_mov_b32 s11, s12
5841; GFX10-CU-NEXT:    s_mov_b32 s10, s13
5842; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
5843; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
5844; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5845; GFX10-CU-NEXT:    s_mov_b32 s7, s10
5846; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
5847; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
5848; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5849; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
5850; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
5851; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5852; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5853; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5854; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5855; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5856; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5857; GFX10-CU-NEXT:    s_endpgm
5858;
5859; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5860; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5861; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5862; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5863; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5864; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5865; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
5866; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5867; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
5868; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
5869; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
5870; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
5871; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
5872; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
5873; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
5874; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
5876; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5877; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5878; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
5879; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5880; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5881; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5882; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5883; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5884; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5885; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5886; SKIP-CACHE-INV-NEXT:    s_endpgm
5887;
5888; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5889; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5890; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5891; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5892; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5893; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5894; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5895; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5896; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5897; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5898; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5899; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5900; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5901; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5902; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5903; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5904;
5905; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5906; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5907; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5908; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5909; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5910; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5911; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5912; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
5913; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5914; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5915; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5916; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5917; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5918; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5919; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
5920; GFX90A-TGSPLIT-NEXT:    s_endpgm
5921;
5922; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5923; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5924; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5925; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5926; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5927; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5928; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5929; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5930; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5931; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5932; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5933; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5934; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5935; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5936; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
5937; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5938;
5939; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5940; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5941; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5942; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5943; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5944; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5945; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5946; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
5947; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5948; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
5949; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5950; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5951; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
5952; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
5953; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
5954; GFX940-TGSPLIT-NEXT:    s_endpgm
5955;
5956; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5957; GFX11-WGP:       ; %bb.0: ; %entry
5958; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5959; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5960; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5961; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5962; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
5963; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
5964; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5965; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
5966; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5967; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5968; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5969; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
5970; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
5971; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5972; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
5973; GFX11-WGP-NEXT:    s_endpgm
5974;
5975; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5976; GFX11-CU:       ; %bb.0: ; %entry
5977; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5978; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5979; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5980; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5981; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
5982; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
5983; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5984; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
5985; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5986; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5987; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5988; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
5989; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
5990; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5991; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
5992; GFX11-CU-NEXT:    s_endpgm
5993;
5994; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
5995; GFX12-WGP:       ; %bb.0: ; %entry
5996; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5997; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5998; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5999; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6000; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6001; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6002; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6003; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6004; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6005; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6006; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6007; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6008; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6009; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6010; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6011; GFX12-WGP-NEXT:    s_endpgm
6012;
6013; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
6014; GFX12-CU:       ; %bb.0: ; %entry
6015; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6016; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6017; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6018; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6019; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6020; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6021; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6022; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6023; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6024; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6025; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6026; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6027; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6028; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6029; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6030; GFX12-CU-NEXT:    s_endpgm
6031    ptr %out, i32 %in, i32 %old) {
6032entry:
6033  %gep = getelementptr i32, ptr %out, i32 4
6034  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
6035  %val0 = extractvalue { i32, i1 } %val, 0
6036  store i32 %val0, ptr %out, align 4
6037  ret void
6038}
6039
; Flat-address cmpxchg at syncscope("wavefront") with acquire success ordering
; and monotonic failure ordering, where the loaded value is used afterwards.
;
; The kernel compare-and-swaps the i32 located four i32 elements past %out
; (byte offset 16; visible below either as "offset:16" on the flat instruction
; or as an explicit 64-bit scalar add of 16 in the sequences that show no
; immediate offset), then stores the value returned by the cmpxchg (field 0 of
; its result pair) to %out.
;
; In the expected output for every check prefix below, the returning cmpswap
; is followed only by address setup, a wait-count instruction and the store;
; no cache-invalidate instruction appears in any of the sequences.
;
; The assertion lines are autogenerated (see the note at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
6040define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
6041; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6042; GFX7:       ; %bb.0: ; %entry
6043; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6044; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6045; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6046; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6047; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6048; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6049; GFX7-NEXT:    s_mov_b32 s6, s4
6050; GFX7-NEXT:    s_mov_b32 s7, s5
6051; GFX7-NEXT:    s_mov_b32 s11, s12
6052; GFX7-NEXT:    s_mov_b32 s10, s13
6053; GFX7-NEXT:    s_add_u32 s6, s6, s11
6054; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6055; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6056; GFX7-NEXT:    s_mov_b32 s7, s10
6057; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6058; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6059; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6060; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6061; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6062; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6063; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6064; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6065; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6066; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6067; GFX7-NEXT:    flat_store_dword v[0:1], v2
6068; GFX7-NEXT:    s_endpgm
6069;
6070; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6071; GFX10-WGP:       ; %bb.0: ; %entry
6072; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6073; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6074; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6075; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6076; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6077; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6078; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6079; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6080; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6081; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6082; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6083; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6084; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6085; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6086; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6087; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6088; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6089; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6090; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6091; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6092; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6093; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6094; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6095; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6096; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6097; GFX10-WGP-NEXT:    s_endpgm
6098;
6099; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6100; GFX10-CU:       ; %bb.0: ; %entry
6101; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6102; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6103; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6104; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6105; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6106; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6107; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6108; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6109; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6110; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6111; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6112; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6113; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6114; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6115; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6116; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6117; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6118; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6119; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6120; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6121; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6122; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6123; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6124; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6125; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6126; GFX10-CU-NEXT:    s_endpgm
6127;
6128; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6129; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6130; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6131; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6132; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6133; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6134; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6135; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6136; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6137; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6138; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6139; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6140; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6141; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6142; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6143; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6144; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6145; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6146; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6147; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6148; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6150; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6151; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6152; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6153; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6154; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6155; SKIP-CACHE-INV-NEXT:    s_endpgm
6156;
6157; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6158; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6159; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6160; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6161; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6162; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6163; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6164; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6165; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6166; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6167; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6168; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6169; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6170; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6171; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6172; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6173;
6174; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6175; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6176; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6177; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6178; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6179; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6180; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6181; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6182; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6183; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6184; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6185; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6186; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6187; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6188; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6189; GFX90A-TGSPLIT-NEXT:    s_endpgm
6190;
6191; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6192; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6193; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6194; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6195; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6196; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6197; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6198; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6199; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6200; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6201; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6202; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6203; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6204; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6205; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6206; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6207;
6208; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6209; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6210; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6211; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6212; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6213; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6214; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6215; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6216; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6217; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6218; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6219; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6220; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6221; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6222; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6223; GFX940-TGSPLIT-NEXT:    s_endpgm
6224;
6225; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6226; GFX11-WGP:       ; %bb.0: ; %entry
6227; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6228; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6229; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6230; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6231; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6232; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6233; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6234; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6235; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6236; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6237; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6238; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6239; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6240; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6241; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6242; GFX11-WGP-NEXT:    s_endpgm
6243;
6244; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6245; GFX11-CU:       ; %bb.0: ; %entry
6246; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6247; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6248; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6249; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6250; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6251; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6252; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6253; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6254; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6255; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6256; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6257; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6258; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6259; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6260; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6261; GFX11-CU-NEXT:    s_endpgm
6262;
6263; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6264; GFX12-WGP:       ; %bb.0: ; %entry
6265; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6266; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6267; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6268; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6269; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6270; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6271; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6272; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6273; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6274; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6275; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6276; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6277; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6278; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6279; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6280; GFX12-WGP-NEXT:    s_endpgm
6281;
6282; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
6283; GFX12-CU:       ; %bb.0: ; %entry
6284; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6285; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6286; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6287; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6288; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6289; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6290; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6291; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6292; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6293; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6294; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6295; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6296; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6297; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6298; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6299; GFX12-CU-NEXT:    s_endpgm
6300    ptr %out, i32 %in, i32 %old) {
6301entry:
  ; Target address is the i32 four elements past %out.
6302  %gep = getelementptr i32, ptr %out, i32 4
  ; %old is the comparand and %in is the replacement value.
6303  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
  ; Field 0 of the cmpxchg result is the value that was loaded from memory.
6304  %val0 = extractvalue { i32, i1 } %val, 0
6305  store i32 %val0, ptr %out, align 4
6306  ret void
6307}
6308
6309define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
6310; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6311; GFX7:       ; %bb.0: ; %entry
6312; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6313; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6314; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6315; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6316; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6317; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6318; GFX7-NEXT:    s_mov_b32 s6, s4
6319; GFX7-NEXT:    s_mov_b32 s7, s5
6320; GFX7-NEXT:    s_mov_b32 s11, s12
6321; GFX7-NEXT:    s_mov_b32 s10, s13
6322; GFX7-NEXT:    s_add_u32 s6, s6, s11
6323; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6324; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6325; GFX7-NEXT:    s_mov_b32 s7, s10
6326; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6327; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6328; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6329; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6330; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6331; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6332; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6333; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6334; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6335; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6336; GFX7-NEXT:    flat_store_dword v[0:1], v2
6337; GFX7-NEXT:    s_endpgm
6338;
6339; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6340; GFX10-WGP:       ; %bb.0: ; %entry
6341; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6342; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6343; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6344; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6345; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6346; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6347; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6348; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6349; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6350; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6351; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6352; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6353; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6354; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6355; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6356; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6357; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6358; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6359; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6360; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6361; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6362; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6363; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6364; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6365; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6366; GFX10-WGP-NEXT:    s_endpgm
6367;
6368; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6369; GFX10-CU:       ; %bb.0: ; %entry
6370; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6371; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6372; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6373; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6374; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6375; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6376; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6377; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6378; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6379; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6380; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6381; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6382; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6383; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6384; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6385; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6386; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6387; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6388; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6389; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6390; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6391; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6392; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6393; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6394; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6395; GFX10-CU-NEXT:    s_endpgm
6396;
6397; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6398; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6399; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6400; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6401; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6402; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6403; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6404; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6405; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6406; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6407; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6408; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6409; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6410; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6411; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6412; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6413; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6414; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6415; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6416; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6418; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6419; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6420; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6421; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6422; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6423; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6424; SKIP-CACHE-INV-NEXT:    s_endpgm
6425;
6426; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6427; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6428; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6429; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6430; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6431; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6432; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6433; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6434; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6435; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6436; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6437; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6438; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6439; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6440; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6441; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6442;
6443; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6444; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6445; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6446; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6447; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6448; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6449; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6450; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6451; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6452; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6453; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6454; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6455; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6456; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6457; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6458; GFX90A-TGSPLIT-NEXT:    s_endpgm
6459;
6460; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6461; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6462; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6463; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6464; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6465; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6466; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6467; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6468; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6469; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6470; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6471; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6472; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6473; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6474; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6475; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6476;
6477; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6478; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6479; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6480; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6481; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6482; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6483; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6484; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6485; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6486; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6487; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6488; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6489; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6490; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6491; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6492; GFX940-TGSPLIT-NEXT:    s_endpgm
6493;
6494; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6495; GFX11-WGP:       ; %bb.0: ; %entry
6496; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6497; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6498; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6499; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6500; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6501; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6502; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6503; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6504; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6505; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6506; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6507; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6508; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6509; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6510; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6511; GFX11-WGP-NEXT:    s_endpgm
6512;
6513; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6514; GFX11-CU:       ; %bb.0: ; %entry
6515; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6516; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6517; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6518; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6519; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6520; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6521; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6522; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6523; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6524; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6525; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6526; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6527; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6528; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6529; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6530; GFX11-CU-NEXT:    s_endpgm
6531;
6532; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6533; GFX12-WGP:       ; %bb.0: ; %entry
6534; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6535; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6536; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6537; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6538; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6539; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6540; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6541; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6542; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6543; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6544; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6545; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6546; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6547; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6548; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6549; GFX12-WGP-NEXT:    s_endpgm
6550;
6551; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
6552; GFX12-CU:       ; %bb.0: ; %entry
6553; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6554; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6555; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6556; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6557; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6558; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6559; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6560; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6561; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6562; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6563; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6564; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6565; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6566; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6567; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6568; GFX12-CU-NEXT:    s_endpgm
6569    ptr %out, i32 %in, i32 %old) {
6570entry:
6571  %gep = getelementptr i32, ptr %out, i32 4
6572  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
6573  %val0 = extractvalue { i32, i1 } %val, 0
6574  store i32 %val0, ptr %out, align 4
6575  ret void
6576}
6577
; Kernel under test: a volatile 32-bit cmpxchg through a flat pointer at
; syncscope("wavefront") with acq_rel success ordering and monotonic failure
; ordering. The value returned by the cmpxchg is then stored to %out.
; Arguments: %out is the base pointer, %in is the replacement value and %old
; is the comparand.
; The assertion lines embedded in the signature below are autogenerated (see
; the first line of this file); regenerate them with the update script instead
; of editing them by hand.
6578define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
6579; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6580; GFX7:       ; %bb.0: ; %entry
6581; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6582; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6583; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6584; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6585; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6586; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6587; GFX7-NEXT:    s_mov_b32 s6, s4
6588; GFX7-NEXT:    s_mov_b32 s7, s5
6589; GFX7-NEXT:    s_mov_b32 s11, s12
6590; GFX7-NEXT:    s_mov_b32 s10, s13
6591; GFX7-NEXT:    s_add_u32 s6, s6, s11
6592; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6593; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6594; GFX7-NEXT:    s_mov_b32 s7, s10
6595; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6596; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6597; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6598; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6599; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6600; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6601; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6602; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6603; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6604; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6605; GFX7-NEXT:    flat_store_dword v[0:1], v2
6606; GFX7-NEXT:    s_endpgm
6607;
6608; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6609; GFX10-WGP:       ; %bb.0: ; %entry
6610; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6611; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6612; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6613; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6614; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6615; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6616; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6617; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6618; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6619; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6620; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6621; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6622; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6623; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6624; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6625; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6626; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6627; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6628; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6629; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6630; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6631; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6632; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6633; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6634; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6635; GFX10-WGP-NEXT:    s_endpgm
6636;
6637; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6638; GFX10-CU:       ; %bb.0: ; %entry
6639; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6640; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6641; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6642; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6643; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6644; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6645; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6646; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6647; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6648; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6649; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6650; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6651; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6652; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6653; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6654; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6655; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6656; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6657; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6658; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6659; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6660; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6661; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6662; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6663; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6664; GFX10-CU-NEXT:    s_endpgm
6665;
6666; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6667; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6668; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6669; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6670; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6671; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6672; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6673; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6674; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6675; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6676; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6677; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6678; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6679; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6680; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6681; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6683; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6684; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6686; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6687; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6688; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6689; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6690; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6691; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6692; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6693; SKIP-CACHE-INV-NEXT:    s_endpgm
6694;
6695; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6696; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6697; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6698; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6699; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6700; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6701; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6702; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6703; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6704; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6705; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6706; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6707; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6708; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6709; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6710; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6711;
6712; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6713; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6714; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6715; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6716; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6717; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6718; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6719; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6720; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6721; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6722; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6723; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6724; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6725; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6726; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6727; GFX90A-TGSPLIT-NEXT:    s_endpgm
6728;
6729; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6730; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6731; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6732; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6733; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6734; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6735; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6736; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6737; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6738; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6739; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6740; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6741; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6742; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6743; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6744; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6745;
6746; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6747; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6748; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6749; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6750; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6751; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6752; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6753; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
6754; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6755; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6756; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6757; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6758; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
6759; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6760; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
6761; GFX940-TGSPLIT-NEXT:    s_endpgm
6762;
6763; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6764; GFX11-WGP:       ; %bb.0: ; %entry
6765; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6766; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6767; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6768; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6769; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
6770; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
6771; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6772; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
6773; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6774; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6775; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6776; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
6777; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
6778; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6779; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
6780; GFX11-WGP-NEXT:    s_endpgm
6781;
6782; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6783; GFX11-CU:       ; %bb.0: ; %entry
6784; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6785; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6786; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6787; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6788; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
6789; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
6790; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6791; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
6792; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6793; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6794; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6795; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
6796; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
6797; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6798; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
6799; GFX11-CU-NEXT:    s_endpgm
6800;
6801; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6802; GFX12-WGP:       ; %bb.0: ; %entry
6803; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6804; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6805; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6806; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6807; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
6808; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
6809; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6810; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
6811; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6812; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6813; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6814; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
6815; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
6816; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
6817; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
6818; GFX12-WGP-NEXT:    s_endpgm
6819;
6820; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
6821; GFX12-CU:       ; %bb.0: ; %entry
6822; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6823; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6824; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6825; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6826; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
6827; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
6828; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6829; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
6830; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6831; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6832; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6833; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
6834; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
6835; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
6836; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
6837; GFX12-CU-NEXT:    s_endpgm
6838    ptr %out, i32 %in, i32 %old) {
6839entry:
; Address of element 4 (in i32 units) past %out, i.e. 16 bytes; this is the
; "offset:16" / constant 16 that appears in the expected output above.
6840  %gep = getelementptr i32, ptr %out, i32 4
; cmpxchg operand order is pointer, comparand (%old), replacement (%in).
6841  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
; Field 0 of the { i32, i1 } result is the value read from memory; the i1
; success flag is not used by this kernel.
6842  %val0 = extractvalue { i32, i1 } %val, 0
; Non-atomic store of that value to the base pointer %out.
6843  store i32 %val0, ptr %out, align 4
6844  ret void
6845}
6846
; Kernel under test: a volatile 32-bit cmpxchg through a flat pointer at
; syncscope("wavefront") with seq_cst success ordering and monotonic failure
; ordering. The value returned by the cmpxchg is then stored to %out.
; Arguments: %out is the base pointer, %in is the replacement value and %old
; is the comparand.
; The assertion lines embedded in the signature below are autogenerated (see
; the first line of this file); regenerate them with the update script instead
; of editing them by hand.
6847define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
6848; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6849; GFX7:       ; %bb.0: ; %entry
6850; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6851; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6852; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6853; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6854; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6855; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6856; GFX7-NEXT:    s_mov_b32 s6, s4
6857; GFX7-NEXT:    s_mov_b32 s7, s5
6858; GFX7-NEXT:    s_mov_b32 s11, s12
6859; GFX7-NEXT:    s_mov_b32 s10, s13
6860; GFX7-NEXT:    s_add_u32 s6, s6, s11
6861; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6862; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6863; GFX7-NEXT:    s_mov_b32 s7, s10
6864; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6865; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6866; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6867; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6868; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6869; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6870; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6871; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6872; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6873; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6874; GFX7-NEXT:    flat_store_dword v[0:1], v2
6875; GFX7-NEXT:    s_endpgm
6876;
6877; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6878; GFX10-WGP:       ; %bb.0: ; %entry
6879; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
6880; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6881; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
6882; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
6883; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
6884; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6885; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
6886; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
6887; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
6888; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
6889; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
6890; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
6891; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6892; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
6893; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
6894; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
6895; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6896; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
6897; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
6898; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6899; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6900; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
6901; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
6902; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6903; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
6904; GFX10-WGP-NEXT:    s_endpgm
6905;
6906; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6907; GFX10-CU:       ; %bb.0: ; %entry
6908; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
6909; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6910; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
6911; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
6912; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
6913; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6914; GFX10-CU-NEXT:    s_mov_b32 s6, s4
6915; GFX10-CU-NEXT:    s_mov_b32 s7, s5
6916; GFX10-CU-NEXT:    s_mov_b32 s11, s12
6917; GFX10-CU-NEXT:    s_mov_b32 s10, s13
6918; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
6919; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
6920; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6921; GFX10-CU-NEXT:    s_mov_b32 s7, s10
6922; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
6923; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
6924; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6925; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
6926; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
6927; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6928; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6929; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
6930; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
6931; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6932; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
6933; GFX10-CU-NEXT:    s_endpgm
6934;
6935; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6936; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6937; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6938; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6939; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6940; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6941; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
6942; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6943; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
6944; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
6945; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
6946; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
6947; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
6948; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
6949; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6950; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6951; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
6952; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
6953; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6954; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
6955; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
6956; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
6957; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6958; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
6959; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
6960; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6961; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
6962; SKIP-CACHE-INV-NEXT:    s_endpgm
6963;
6964; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6965; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6966; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6967; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6968; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6969; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6970; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6971; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6972; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6973; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6974; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6975; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6976; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6977; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6978; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6979; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6980;
6981; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6982; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6983; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6984; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6985; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6986; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6987; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6988; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
6989; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6990; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
6991; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6992; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6993; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6994; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6995; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
6996; GFX90A-TGSPLIT-NEXT:    s_endpgm
6997;
6998; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
6999; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7000; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7001; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7002; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7003; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7004; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7005; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7006; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7007; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7008; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7009; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7010; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7011; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7012; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7013; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7014;
7015; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
7016; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7017; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7018; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7019; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7020; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7021; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7022; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7023; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7024; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7025; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7026; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7027; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7028; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7029; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7030; GFX940-TGSPLIT-NEXT:    s_endpgm
7031;
7032; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
7033; GFX11-WGP:       ; %bb.0: ; %entry
7034; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7035; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7036; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7037; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7038; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7039; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7040; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7041; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7042; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7043; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7044; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7045; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7046; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7047; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7048; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7049; GFX11-WGP-NEXT:    s_endpgm
7050;
7051; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
7052; GFX11-CU:       ; %bb.0: ; %entry
7053; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7054; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7055; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7056; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7057; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7058; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7059; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7060; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7061; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7062; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7063; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7064; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7065; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7066; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7067; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7068; GFX11-CU-NEXT:    s_endpgm
7069;
7070; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
7071; GFX12-WGP:       ; %bb.0: ; %entry
7072; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7073; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7074; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7075; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7076; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7077; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7078; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7079; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7080; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7081; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7082; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7083; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7084; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7085; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7086; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7087; GFX12-WGP-NEXT:    s_endpgm
7088;
7089; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
7090; GFX12-CU:       ; %bb.0: ; %entry
7091; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7092; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7093; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7094; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7095; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7096; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7097; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7098; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7099; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7100; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7101; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7102; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7103; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7104; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7105; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7106; GFX12-CU-NEXT:    s_endpgm
7107    ptr %out, i32 %in, i32 %old) {
7108entry:
; Address of element 4 (in i32 units) past %out, i.e. 16 bytes; this is the
; "offset:16" / constant 16 that appears in the expected output above.
7109  %gep = getelementptr i32, ptr %out, i32 4
; cmpxchg operand order is pointer, comparand (%old), replacement (%in).
7110  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
; Field 0 of the { i32, i1 } result is the value read from memory; the i1
; success flag is not used by this kernel.
7111  %val0 = extractvalue { i32, i1 } %val, 0
; Non-atomic store of that value to the base pointer %out.
7112  store i32 %val0, ptr %out, align 4
7113  ret void
7114}
7115
; Value-returning cmpxchg through a generic (flat) pointer at
; syncscope("wavefront"), with monotonic success ordering and acquire failure
; ordering. The kernel compare-and-swaps the i32 located 16 bytes past %out
; (%old is the expected value, %in the replacement) and then stores the value
; the cmpxchg read back to %out itself.
;
; In every expected sequence below, what follows the atomic is only the address
; moves, one wait instruction, the store and s_endpgm.
;
; The per-target assertion lines inside the definition are autogenerated (see
; the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7116define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
7117; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7118; GFX7:       ; %bb.0: ; %entry
7119; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7120; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7121; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7122; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7123; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7124; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7125; GFX7-NEXT:    s_mov_b32 s6, s4
7126; GFX7-NEXT:    s_mov_b32 s7, s5
7127; GFX7-NEXT:    s_mov_b32 s11, s12
7128; GFX7-NEXT:    s_mov_b32 s10, s13
7129; GFX7-NEXT:    s_add_u32 s6, s6, s11
7130; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7131; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7132; GFX7-NEXT:    s_mov_b32 s7, s10
7133; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7134; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7135; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7136; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7137; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7138; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7139; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7140; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7141; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7142; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7143; GFX7-NEXT:    flat_store_dword v[0:1], v2
7144; GFX7-NEXT:    s_endpgm
7145;
7146; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7147; GFX10-WGP:       ; %bb.0: ; %entry
7148; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7149; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7150; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7151; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7152; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7153; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7154; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7155; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7156; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7157; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7158; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7159; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7160; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7161; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7162; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7163; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7164; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7165; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7166; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7167; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7168; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7169; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7170; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7171; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7172; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7173; GFX10-WGP-NEXT:    s_endpgm
7174;
7175; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7176; GFX10-CU:       ; %bb.0: ; %entry
7177; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7178; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7179; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7180; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7181; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7182; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7183; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7184; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7185; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7186; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7187; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7188; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7189; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7190; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7191; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7192; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7193; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7194; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7195; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7196; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7197; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7198; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7199; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7200; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7201; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7202; GFX10-CU-NEXT:    s_endpgm
7203;
7204; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7205; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7206; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7207; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7208; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7209; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7210; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7211; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7212; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7213; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7214; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7215; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7216; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7217; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7218; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7220; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7222; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7223; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7225; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7226; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7227; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7228; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7229; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7230; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7231; SKIP-CACHE-INV-NEXT:    s_endpgm
7232;
7233; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7234; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7235; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7236; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7237; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7238; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7239; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7240; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7241; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7242; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7243; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7244; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7245; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7246; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7247; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7248; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7249;
7250; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7251; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7252; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7253; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7254; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7255; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7256; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7257; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7258; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7259; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7260; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7261; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7262; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7263; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7264; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7265; GFX90A-TGSPLIT-NEXT:    s_endpgm
7266;
7267; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7268; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7269; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7270; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7271; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7272; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7273; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7274; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7275; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7276; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7277; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7278; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7279; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7280; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7281; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7282; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7283;
7284; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7285; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7286; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7287; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7288; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7289; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7290; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7291; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7292; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7293; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7294; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7295; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7296; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7297; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7298; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7299; GFX940-TGSPLIT-NEXT:    s_endpgm
7300;
7301; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7302; GFX11-WGP:       ; %bb.0: ; %entry
7303; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7304; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7305; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7306; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7307; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7308; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7309; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7310; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7311; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7312; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7313; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7314; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7315; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7316; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7317; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7318; GFX11-WGP-NEXT:    s_endpgm
7319;
7320; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7321; GFX11-CU:       ; %bb.0: ; %entry
7322; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7323; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7324; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7325; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7326; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7327; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7328; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7329; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7330; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7331; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7332; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7333; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7334; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7335; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7336; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7337; GFX11-CU-NEXT:    s_endpgm
7338;
7339; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7340; GFX12-WGP:       ; %bb.0: ; %entry
7341; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7342; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7343; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7344; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7345; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7346; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7347; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7348; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7349; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7350; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7351; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7352; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7353; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7354; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7355; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7356; GFX12-WGP-NEXT:    s_endpgm
7357;
7358; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
7359; GFX12-CU:       ; %bb.0: ; %entry
7360; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7361; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7362; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7363; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7364; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7365; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7366; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7367; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7368; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7369; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7370; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7371; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7372; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7373; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7374; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7375; GFX12-CU-NEXT:    s_endpgm
7376    ptr %out, i32 %in, i32 %old) {
7377entry:
  ; i32 element index 4, i.e. 16 bytes past %out.
7378  %gep = getelementptr i32, ptr %out, i32 4
  ; Operands are pointer, expected value, replacement value; the two orderings
  ; are success (monotonic) followed by failure (acquire).
7379  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
  ; Element 0 of the { i32, i1 } result is the value read from memory.
7380  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic in use.
7381  store i32 %val0, ptr %out, align 4
7382  ret void
7383}
7384
; Value-returning cmpxchg through a generic (flat) pointer at
; syncscope("wavefront"), with acquire ordering for both the success and the
; failure case. The kernel compare-and-swaps the i32 located 16 bytes past %out
; (%old is the expected value, %in the replacement) and then stores the value
; the cmpxchg read back to %out itself.
;
; In every expected sequence below, what follows the atomic is only the address
; moves, one wait instruction, the store and s_endpgm.
;
; The per-target assertion lines inside the definition are autogenerated (see
; the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7385define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
7386; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7387; GFX7:       ; %bb.0: ; %entry
7388; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7389; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7390; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7391; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7392; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7393; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7394; GFX7-NEXT:    s_mov_b32 s6, s4
7395; GFX7-NEXT:    s_mov_b32 s7, s5
7396; GFX7-NEXT:    s_mov_b32 s11, s12
7397; GFX7-NEXT:    s_mov_b32 s10, s13
7398; GFX7-NEXT:    s_add_u32 s6, s6, s11
7399; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7400; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7401; GFX7-NEXT:    s_mov_b32 s7, s10
7402; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7403; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7404; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7405; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7406; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7407; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7408; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7409; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7410; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7411; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7412; GFX7-NEXT:    flat_store_dword v[0:1], v2
7413; GFX7-NEXT:    s_endpgm
7414;
7415; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7416; GFX10-WGP:       ; %bb.0: ; %entry
7417; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7418; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7419; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7420; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7421; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7422; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7423; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7424; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7425; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7426; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7427; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7428; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7429; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7430; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7431; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7432; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7433; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7434; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7435; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7436; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7437; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7438; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7439; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7440; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7441; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7442; GFX10-WGP-NEXT:    s_endpgm
7443;
7444; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7445; GFX10-CU:       ; %bb.0: ; %entry
7446; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7447; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7448; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7449; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7450; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7451; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7452; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7453; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7454; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7455; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7456; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7457; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7458; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7459; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7460; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7461; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7462; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7463; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7464; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7465; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7466; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7467; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7468; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7469; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7470; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7471; GFX10-CU-NEXT:    s_endpgm
7472;
7473; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7474; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7475; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7476; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7477; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7478; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7479; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7480; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7481; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7482; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7484; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7485; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7486; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7487; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7488; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7491; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7492; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7493; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7494; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7495; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7497; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7498; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7499; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7500; SKIP-CACHE-INV-NEXT:    s_endpgm
7501;
7502; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7503; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7504; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7505; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7506; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7507; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7508; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7509; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7510; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7511; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7512; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7513; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7514; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7515; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7516; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7517; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7518;
7519; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7520; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7521; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7522; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7523; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7524; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7525; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7526; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7527; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7528; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7529; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7530; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7531; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7532; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7533; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7534; GFX90A-TGSPLIT-NEXT:    s_endpgm
7535;
7536; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7537; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7538; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7539; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7540; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7541; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7542; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7543; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7544; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7545; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7546; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7547; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7548; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7549; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7550; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7551; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7552;
7553; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7554; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7555; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7556; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7557; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7558; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7559; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7560; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7561; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7562; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7563; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7564; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7565; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7566; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7567; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7568; GFX940-TGSPLIT-NEXT:    s_endpgm
7569;
7570; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7571; GFX11-WGP:       ; %bb.0: ; %entry
7572; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7573; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7574; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7575; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7576; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7577; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7578; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7579; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7580; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7581; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7582; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7583; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7584; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7585; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7586; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7587; GFX11-WGP-NEXT:    s_endpgm
7588;
7589; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7590; GFX11-CU:       ; %bb.0: ; %entry
7591; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7592; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7593; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7594; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7595; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7596; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7597; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7598; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7599; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7600; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7601; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7602; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7603; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7604; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7605; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7606; GFX11-CU-NEXT:    s_endpgm
7607;
7608; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7609; GFX12-WGP:       ; %bb.0: ; %entry
7610; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7611; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7612; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7613; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7614; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7615; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7616; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7617; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7618; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7619; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7620; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7621; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7622; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7623; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7624; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7625; GFX12-WGP-NEXT:    s_endpgm
7626;
7627; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
7628; GFX12-CU:       ; %bb.0: ; %entry
7629; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7630; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7631; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7632; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7633; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7634; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7635; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7636; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7637; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7638; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7639; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7640; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7641; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7642; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7643; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7644; GFX12-CU-NEXT:    s_endpgm
7645    ptr %out, i32 %in, i32 %old) {
7646entry:
  ; i32 element index 4, i.e. 16 bytes past %out.
7647  %gep = getelementptr i32, ptr %out, i32 4
  ; Operands are pointer, expected value, replacement value; the two orderings
  ; are success (acquire) followed by failure (acquire).
7648  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
  ; Element 0 of the { i32, i1 } result is the value read from memory.
7649  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic in use.
7650  store i32 %val0, ptr %out, align 4
7651  ret void
7652}
7653
7654define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
7655; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7656; GFX7:       ; %bb.0: ; %entry
7657; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7658; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7659; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7660; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7661; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7662; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7663; GFX7-NEXT:    s_mov_b32 s6, s4
7664; GFX7-NEXT:    s_mov_b32 s7, s5
7665; GFX7-NEXT:    s_mov_b32 s11, s12
7666; GFX7-NEXT:    s_mov_b32 s10, s13
7667; GFX7-NEXT:    s_add_u32 s6, s6, s11
7668; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7669; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7670; GFX7-NEXT:    s_mov_b32 s7, s10
7671; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7672; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7673; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7674; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7675; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7676; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7677; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7678; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7679; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7680; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7681; GFX7-NEXT:    flat_store_dword v[0:1], v2
7682; GFX7-NEXT:    s_endpgm
7683;
7684; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7685; GFX10-WGP:       ; %bb.0: ; %entry
7686; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7687; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7688; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7689; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7690; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7691; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7692; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7693; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7694; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7695; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7696; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7697; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7698; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7699; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7700; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7701; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7702; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7703; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7704; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7705; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7706; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7707; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7708; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7709; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7710; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7711; GFX10-WGP-NEXT:    s_endpgm
7712;
7713; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7714; GFX10-CU:       ; %bb.0: ; %entry
7715; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7716; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7717; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7718; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7719; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7720; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7721; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7722; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7723; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7724; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7725; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7726; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7727; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7728; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7729; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7730; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
7731; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7732; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
7733; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
7734; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7735; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7736; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
7737; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
7738; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7739; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
7740; GFX10-CU-NEXT:    s_endpgm
7741;
7742; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7743; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7744; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7745; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7746; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7747; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7748; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
7749; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7750; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
7751; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
7752; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
7753; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
7754; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
7755; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
7756; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7757; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7758; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
7759; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
7760; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7761; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
7762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
7763; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
7764; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7765; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
7766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
7767; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7768; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
7769; SKIP-CACHE-INV-NEXT:    s_endpgm
7770;
7771; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7772; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7773; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7774; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7775; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7776; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7777; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7778; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7779; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7780; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7781; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7782; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7783; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7784; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7785; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7786; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7787;
7788; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7789; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7790; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7791; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7792; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7793; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7794; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7795; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
7796; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7797; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7798; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7799; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7800; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7801; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7802; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
7803; GFX90A-TGSPLIT-NEXT:    s_endpgm
7804;
7805; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7806; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7807; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7808; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7809; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7810; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7811; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7812; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7813; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7814; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7815; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7816; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7817; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7818; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7819; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7820; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7821;
7822; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7823; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7824; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7825; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7826; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7827; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7828; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7829; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
7830; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7831; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
7832; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7833; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7834; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
7835; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7836; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
7837; GFX940-TGSPLIT-NEXT:    s_endpgm
7838;
7839; GFX11-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7840; GFX11-WGP:       ; %bb.0: ; %entry
7841; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7842; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7843; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7844; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7845; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
7846; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
7847; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7848; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
7849; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7850; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7851; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7852; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
7853; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
7854; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7855; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
7856; GFX11-WGP-NEXT:    s_endpgm
7857;
7858; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7859; GFX11-CU:       ; %bb.0: ; %entry
7860; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7861; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7862; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7863; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7864; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
7865; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
7866; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7867; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
7868; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7869; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7870; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7871; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
7872; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
7873; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7874; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
7875; GFX11-CU-NEXT:    s_endpgm
7876;
7877; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7878; GFX12-WGP:       ; %bb.0: ; %entry
7879; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7880; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7881; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7882; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7883; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
7884; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
7885; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7886; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
7887; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7888; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7889; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7890; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
7891; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
7892; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
7893; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
7894; GFX12-WGP-NEXT:    s_endpgm
7895;
7896; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
7897; GFX12-CU:       ; %bb.0: ; %entry
7898; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7899; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7900; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7901; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7902; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
7903; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
7904; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7905; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
7906; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7907; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7908; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7909; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
7910; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
7911; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
7912; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
7913; GFX12-CU-NEXT:    s_endpgm
7914    ptr %out, i32 %in, i32 %old) {
7915entry:
7916  %gep = getelementptr i32, ptr %out, i32 4
7917  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
7918  %val0 = extractvalue { i32, i1 } %val, 0
7919  store i32 %val0, ptr %out, align 4
7920  ret void
7921}
7922
; Test: returning flat cmpxchg at syncscope("wavefront") with acq_rel success
; ordering and acquire failure ordering. The kernel performs the cmpxchg on the
; i32 four elements past %out and stores the returned i32 back to %out.
;
; The per-target assertions between the "define" line and the parameter list
; are autogenerated (see the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected sequences below have in common: a returning cmpswap, then a
; wait instruction (s_waitcnt or s_wait_loadcnt_dscnt), then the flat store. No
; separate cache-invalidate instruction appears in any checked sequence for this
; function. The two tgsplit configurations expect a wait on vmcnt(0) only, while
; the other s_waitcnt-based configurations expect vmcnt(0) lgkmcnt(0).
7923define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
7924; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
7925; GFX7:       ; %bb.0: ; %entry
7926; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7927; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7928; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7929; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7930; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7931; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7932; GFX7-NEXT:    s_mov_b32 s6, s4
7933; GFX7-NEXT:    s_mov_b32 s7, s5
7934; GFX7-NEXT:    s_mov_b32 s11, s12
7935; GFX7-NEXT:    s_mov_b32 s10, s13
7936; GFX7-NEXT:    s_add_u32 s6, s6, s11
7937; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7938; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7939; GFX7-NEXT:    s_mov_b32 s7, s10
7940; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7941; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7942; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7943; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7944; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7945; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7946; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7947; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7948; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7949; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7950; GFX7-NEXT:    flat_store_dword v[0:1], v2
7951; GFX7-NEXT:    s_endpgm
7952;
7953; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
7954; GFX10-WGP:       ; %bb.0: ; %entry
7955; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
7956; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7957; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
7958; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
7959; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
7960; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7961; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
7962; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
7963; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
7964; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
7965; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
7966; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
7967; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7968; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
7969; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
7970; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
7971; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7972; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
7973; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
7974; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7975; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7976; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
7977; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
7978; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7979; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
7980; GFX10-WGP-NEXT:    s_endpgm
7981;
7982; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
7983; GFX10-CU:       ; %bb.0: ; %entry
7984; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
7985; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7986; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
7987; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
7988; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
7989; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7990; GFX10-CU-NEXT:    s_mov_b32 s6, s4
7991; GFX10-CU-NEXT:    s_mov_b32 s7, s5
7992; GFX10-CU-NEXT:    s_mov_b32 s11, s12
7993; GFX10-CU-NEXT:    s_mov_b32 s10, s13
7994; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
7995; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
7996; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7997; GFX10-CU-NEXT:    s_mov_b32 s7, s10
7998; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
7999; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8000; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8001; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8002; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8003; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8004; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8005; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8006; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8007; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8008; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8009; GFX10-CU-NEXT:    s_endpgm
8010;
8011; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8012; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8013; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8014; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8015; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8016; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8017; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8018; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8019; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8020; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8021; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8022; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8023; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8024; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8025; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8026; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8027; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8028; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8029; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8030; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8031; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8032; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8033; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8034; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8035; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8036; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8037; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8038; SKIP-CACHE-INV-NEXT:    s_endpgm
8039;
8040; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8041; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8042; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8043; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8044; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8045; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8046; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8047; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8048; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8049; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8050; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8051; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8052; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8053; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8054; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8055; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8056;
8057; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8058; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8059; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8060; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8061; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8062; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8063; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8064; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8065; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8066; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8067; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8068; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8069; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8070; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8071; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8072; GFX90A-TGSPLIT-NEXT:    s_endpgm
8073;
8074; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8075; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8076; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8077; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8078; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8079; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8080; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8081; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8082; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8083; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8084; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8085; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8086; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8087; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8088; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8089; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8090;
8091; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8092; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8093; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8094; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8095; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8096; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8097; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8098; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8099; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8100; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8101; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8102; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8103; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8104; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8105; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8106; GFX940-TGSPLIT-NEXT:    s_endpgm
8107;
8108; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8109; GFX11-WGP:       ; %bb.0: ; %entry
8110; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8111; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8112; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8113; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8114; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8115; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8116; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8117; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8118; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8119; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8120; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8121; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8122; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8123; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8124; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8125; GFX11-WGP-NEXT:    s_endpgm
8126;
8127; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8128; GFX11-CU:       ; %bb.0: ; %entry
8129; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8130; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8131; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8132; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8133; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8134; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8135; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8136; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8137; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8138; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8139; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8140; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8141; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8142; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8143; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8144; GFX11-CU-NEXT:    s_endpgm
8145;
8146; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8147; GFX12-WGP:       ; %bb.0: ; %entry
8148; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8149; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8150; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8151; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8152; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8153; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8154; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8155; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8156; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8157; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8158; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8159; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8160; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8161; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8162; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8163; GFX12-WGP-NEXT:    s_endpgm
8164;
8165; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
8166; GFX12-CU:       ; %bb.0: ; %entry
8167; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8168; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8169; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8170; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8171; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8172; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8173; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8174; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8175; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8176; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8177; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8178; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8179; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8180; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8181; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8182; GFX12-CU-NEXT:    s_endpgm
8183    ptr %out, i32 %in, i32 %old) {
8184entry:
  ; Index 4 on an i32 element type is a 16-byte displacement. In the checks above
  ; it appears either folded into the cmpswap as "offset:16" or as an explicit
  ; 64-bit scalar add of 16 to the pointer before the cmpswap.
8185  %gep = getelementptr i32, ptr %out, i32 4
  ; The operation under test: compare against %old, write %in on a match.
8186  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
  ; Take the i32 member of the { i32, i1 } result pair.
8187  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store that i32 to the un-offset %out pointer, so the returned value is used.
8188  store i32 %val0, ptr %out, align 4
8189  ret void
8190}
8191
8192define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
8193; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8194; GFX7:       ; %bb.0: ; %entry
8195; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8196; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8197; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8198; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8199; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8200; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8201; GFX7-NEXT:    s_mov_b32 s6, s4
8202; GFX7-NEXT:    s_mov_b32 s7, s5
8203; GFX7-NEXT:    s_mov_b32 s11, s12
8204; GFX7-NEXT:    s_mov_b32 s10, s13
8205; GFX7-NEXT:    s_add_u32 s6, s6, s11
8206; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8207; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8208; GFX7-NEXT:    s_mov_b32 s7, s10
8209; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8210; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8211; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8212; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8213; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8214; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8215; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8216; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8217; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8218; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8219; GFX7-NEXT:    flat_store_dword v[0:1], v2
8220; GFX7-NEXT:    s_endpgm
8221;
8222; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8223; GFX10-WGP:       ; %bb.0: ; %entry
8224; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8225; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8226; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8227; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8228; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8229; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8230; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8231; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8232; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8233; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8234; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8235; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8236; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8237; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8238; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8239; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8240; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8241; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8242; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8243; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8244; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8245; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8246; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8247; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8248; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8249; GFX10-WGP-NEXT:    s_endpgm
8250;
8251; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8252; GFX10-CU:       ; %bb.0: ; %entry
8253; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8254; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8255; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8256; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8257; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8258; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8259; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8260; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8261; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8262; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8263; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8264; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8265; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8266; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8267; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8268; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8269; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8270; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8271; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8272; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8273; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8274; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8275; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8276; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8277; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8278; GFX10-CU-NEXT:    s_endpgm
8279;
8280; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8281; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8282; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8283; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8284; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8285; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8286; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8287; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8288; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8289; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8290; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8291; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8292; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8293; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8294; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8295; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8296; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8297; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8298; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8299; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8300; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8301; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8302; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8303; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8304; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8305; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8306; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8307; SKIP-CACHE-INV-NEXT:    s_endpgm
8308;
8309; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8310; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8311; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8312; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8313; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8314; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8315; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8316; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8317; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8318; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8319; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8320; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8321; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8322; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8323; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8324; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8325;
8326; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8327; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8328; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8329; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8330; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8331; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8332; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8333; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8334; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8335; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8336; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8337; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8338; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8339; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8340; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8341; GFX90A-TGSPLIT-NEXT:    s_endpgm
8342;
8343; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8344; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8345; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8346; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8347; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8348; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8349; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8350; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8351; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8352; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8353; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8354; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8355; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8356; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8357; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8358; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8359;
8360; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8361; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8362; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8363; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8364; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8365; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8366; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8367; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8368; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8369; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8370; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8371; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8372; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8373; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8374; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8375; GFX940-TGSPLIT-NEXT:    s_endpgm
8376;
8377; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8378; GFX11-WGP:       ; %bb.0: ; %entry
8379; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8380; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8381; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8382; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8383; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8384; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8385; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8386; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8387; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8388; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8389; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8390; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8391; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8392; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8393; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8394; GFX11-WGP-NEXT:    s_endpgm
8395;
8396; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8397; GFX11-CU:       ; %bb.0: ; %entry
8398; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8399; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8400; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8401; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8402; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8403; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8404; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8405; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8406; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8407; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8408; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8409; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8410; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8411; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8412; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8413; GFX11-CU-NEXT:    s_endpgm
8414;
8415; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8416; GFX12-WGP:       ; %bb.0: ; %entry
8417; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8418; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8419; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8420; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8421; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8422; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8423; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8424; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8425; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8426; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8427; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8428; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8429; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8430; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8431; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8432; GFX12-WGP-NEXT:    s_endpgm
8433;
8434; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
8435; GFX12-CU:       ; %bb.0: ; %entry
8436; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8437; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8438; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8439; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8440; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8441; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8442; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8443; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8444; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8445; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8446; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8447; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8448; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8449; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8450; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8451; GFX12-CU-NEXT:    s_endpgm
8452    ptr %out, i32 %in, i32 %old) {
8453entry:
8454  %gep = getelementptr i32, ptr %out, i32 4
8455  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
8456  %val0 = extractvalue { i32, i1 } %val, 0
8457  store i32 %val0, ptr %out, align 4
8458  ret void
8459}
8460
; Returning cmpxchg through a generic (flat) pointer at syncscope("wavefront")
; with success ordering monotonic and failure ordering seq_cst. The exchange
; targets the i32 slot four elements (16 bytes) past %out, and the value it
; loaded is stored back to %out so the returned result is actually used.
; The per-target assertion lines inside the signature are autogenerated (see
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8461define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
8462; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8463; GFX7:       ; %bb.0: ; %entry
8464; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8465; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8466; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8467; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8468; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8469; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8470; GFX7-NEXT:    s_mov_b32 s6, s4
8471; GFX7-NEXT:    s_mov_b32 s7, s5
8472; GFX7-NEXT:    s_mov_b32 s11, s12
8473; GFX7-NEXT:    s_mov_b32 s10, s13
8474; GFX7-NEXT:    s_add_u32 s6, s6, s11
8475; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8476; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8477; GFX7-NEXT:    s_mov_b32 s7, s10
8478; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8479; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8480; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8481; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8482; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8483; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8484; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8485; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8486; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8487; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8488; GFX7-NEXT:    flat_store_dword v[0:1], v2
8489; GFX7-NEXT:    s_endpgm
8490;
8491; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8492; GFX10-WGP:       ; %bb.0: ; %entry
8493; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8494; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8495; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8496; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8497; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8498; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8499; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8500; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8501; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8502; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8503; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8504; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8505; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8506; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8507; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8508; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8509; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8510; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8511; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8512; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8513; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8514; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8515; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8516; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8517; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8518; GFX10-WGP-NEXT:    s_endpgm
8519;
8520; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8521; GFX10-CU:       ; %bb.0: ; %entry
8522; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8523; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8524; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8525; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8526; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8527; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8528; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8529; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8530; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8531; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8532; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8533; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8534; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8535; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8536; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8537; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8538; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8539; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8540; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8541; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8542; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8543; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8544; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8545; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8546; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8547; GFX10-CU-NEXT:    s_endpgm
8548;
8549; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8550; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8551; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8552; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8553; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8554; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8555; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8556; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8557; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8558; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8559; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8560; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8561; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8562; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8563; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8564; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8566; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8567; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8568; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8569; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8570; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8571; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8572; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8573; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8574; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8575; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8576; SKIP-CACHE-INV-NEXT:    s_endpgm
8577;
8578; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8579; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8580; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8581; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8582; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8583; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8584; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8585; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8586; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8587; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8588; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8589; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8590; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8591; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8592; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8593; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8594;
8595; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8596; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8597; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8598; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8599; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8600; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8601; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8602; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8603; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8604; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8605; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8606; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8607; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8608; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8609; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8610; GFX90A-TGSPLIT-NEXT:    s_endpgm
8611;
8612; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8613; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8614; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8615; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8616; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8617; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8618; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8619; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8620; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8621; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8622; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8623; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8624; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8625; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8626; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8627; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8628;
8629; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8630; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8631; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8632; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8633; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8634; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8635; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8636; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8637; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8638; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8639; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8640; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8641; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8642; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8643; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8644; GFX940-TGSPLIT-NEXT:    s_endpgm
8645;
8646; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8647; GFX11-WGP:       ; %bb.0: ; %entry
8648; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8649; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8650; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8651; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8652; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8653; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8654; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8655; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8656; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8657; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8658; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8659; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8660; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8661; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8662; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8663; GFX11-WGP-NEXT:    s_endpgm
8664;
8665; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8666; GFX11-CU:       ; %bb.0: ; %entry
8667; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8668; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8669; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8670; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8671; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8672; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8673; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8674; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8675; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8676; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8677; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8678; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8679; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8680; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8681; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8682; GFX11-CU-NEXT:    s_endpgm
8683;
8684; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8685; GFX12-WGP:       ; %bb.0: ; %entry
8686; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8687; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8688; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8689; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8690; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8691; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8692; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8693; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8694; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8695; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8696; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8697; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8698; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8699; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8700; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8701; GFX12-WGP-NEXT:    s_endpgm
8702;
8703; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
8704; GFX12-CU:       ; %bb.0: ; %entry
8705; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8706; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8707; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8708; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8709; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8710; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8711; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8712; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8713; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8714; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8715; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8716; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8717; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8718; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8719; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8720; GFX12-CU-NEXT:    s_endpgm
8721    ptr %out, i32 %in, i32 %old) {
8722entry:
  ; Address of i32 element 4 relative to %out, i.e. a 16-byte offset.
8723  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and swap in %in; the two orderings are written as
  ; <success> <failure>, so success is monotonic and failure is seq_cst.
8724  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
  ; Field 0 of the { i32, i1 } result is the value that was read from memory.
8725  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the loaded value to %out so the returned result has a use.
8726  store i32 %val0, ptr %out, align 4
8727  ret void
8728}
8729
; Returning cmpxchg through a generic (flat) pointer at syncscope("wavefront")
; with success ordering acquire and failure ordering seq_cst. The exchange
; targets the i32 slot four elements (16 bytes) past %out, and the value it
; loaded is stored back to %out so the returned result is actually used.
; The per-target assertion lines inside the signature are autogenerated (see
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8730define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
8731; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8732; GFX7:       ; %bb.0: ; %entry
8733; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8734; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8735; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8736; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8737; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8738; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8739; GFX7-NEXT:    s_mov_b32 s6, s4
8740; GFX7-NEXT:    s_mov_b32 s7, s5
8741; GFX7-NEXT:    s_mov_b32 s11, s12
8742; GFX7-NEXT:    s_mov_b32 s10, s13
8743; GFX7-NEXT:    s_add_u32 s6, s6, s11
8744; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8745; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8746; GFX7-NEXT:    s_mov_b32 s7, s10
8747; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8748; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8749; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8750; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8751; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8752; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8753; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8754; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8755; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8756; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8757; GFX7-NEXT:    flat_store_dword v[0:1], v2
8758; GFX7-NEXT:    s_endpgm
8759;
8760; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8761; GFX10-WGP:       ; %bb.0: ; %entry
8762; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
8763; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8764; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
8765; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
8766; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
8767; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8768; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
8769; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
8770; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
8771; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
8772; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
8773; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
8774; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8775; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
8776; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
8777; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
8778; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8779; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
8780; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
8781; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8782; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8783; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
8784; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
8785; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8786; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
8787; GFX10-WGP-NEXT:    s_endpgm
8788;
8789; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8790; GFX10-CU:       ; %bb.0: ; %entry
8791; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
8792; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8793; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
8794; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
8795; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
8796; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8797; GFX10-CU-NEXT:    s_mov_b32 s6, s4
8798; GFX10-CU-NEXT:    s_mov_b32 s7, s5
8799; GFX10-CU-NEXT:    s_mov_b32 s11, s12
8800; GFX10-CU-NEXT:    s_mov_b32 s10, s13
8801; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
8802; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
8803; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8804; GFX10-CU-NEXT:    s_mov_b32 s7, s10
8805; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
8806; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
8807; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8808; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
8809; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
8810; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8811; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8812; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
8813; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
8814; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8815; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
8816; GFX10-CU-NEXT:    s_endpgm
8817;
8818; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8819; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8820; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8821; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8822; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8823; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8824; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
8825; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8826; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
8827; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
8828; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
8829; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
8830; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
8831; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
8832; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8833; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8834; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
8835; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
8836; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8837; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
8838; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
8839; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
8840; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8841; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
8842; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
8843; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8844; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
8845; SKIP-CACHE-INV-NEXT:    s_endpgm
8846;
8847; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8848; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8849; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8850; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8851; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8852; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8853; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8854; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8855; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8856; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8857; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8858; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8859; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8860; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8861; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8862; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8863;
8864; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8865; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8866; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8867; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8868; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8869; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8870; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8871; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
8872; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8873; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8874; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8875; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8876; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8877; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8878; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
8879; GFX90A-TGSPLIT-NEXT:    s_endpgm
8880;
8881; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8882; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8883; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8884; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8885; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8886; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8887; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8888; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8889; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8890; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8891; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8892; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8893; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8894; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8895; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8896; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8897;
8898; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8899; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8900; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8901; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8902; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8903; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8904; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8905; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
8906; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8907; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
8908; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8909; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8910; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
8911; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8912; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
8913; GFX940-TGSPLIT-NEXT:    s_endpgm
8914;
8915; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8916; GFX11-WGP:       ; %bb.0: ; %entry
8917; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8918; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8919; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8920; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8921; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
8922; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
8923; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8924; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
8925; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8926; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8927; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8928; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
8929; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
8930; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8931; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
8932; GFX11-WGP-NEXT:    s_endpgm
8933;
8934; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8935; GFX11-CU:       ; %bb.0: ; %entry
8936; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8937; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8938; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8939; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8940; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
8941; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
8942; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8943; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
8944; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8945; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8946; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8947; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
8948; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
8949; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8950; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
8951; GFX11-CU-NEXT:    s_endpgm
8952;
8953; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8954; GFX12-WGP:       ; %bb.0: ; %entry
8955; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8956; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8957; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8958; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8959; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
8960; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
8961; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8962; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
8963; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8964; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8965; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8966; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
8967; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
8968; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
8969; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
8970; GFX12-WGP-NEXT:    s_endpgm
8971;
8972; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
8973; GFX12-CU:       ; %bb.0: ; %entry
8974; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8975; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8976; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8977; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8978; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
8979; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
8980; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8981; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
8982; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8983; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8984; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8985; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
8986; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
8987; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
8988; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
8989; GFX12-CU-NEXT:    s_endpgm
8990    ptr %out, i32 %in, i32 %old) {
8991entry:
  ; Address of i32 element 4 relative to %out, i.e. a 16-byte offset.
8992  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and swap in %in; the two orderings are written as
  ; <success> <failure>, so success is acquire and failure is seq_cst.
8993  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
  ; Field 0 of the { i32, i1 } result is the value that was read from memory.
8994  %val0 = extractvalue { i32, i1 } %val, 0
  ; Store the loaded value to %out so the returned result has a use.
8995  store i32 %val0, ptr %out, align 4
8996  ret void
8997}
8998
; Returning cmpxchg at syncscope("wavefront") with release success ordering and
; seq_cst failure ordering, issued through a plain `ptr` (the expected code uses
; flat_* instructions). The compare-and-swap targets i32 element 4 of %out
; (byte offset 16); the value it reads back is then stored to %out itself.
;
; What the expectations below pin down, per RUN configuration:
;  - the instruction stream is only kernarg loads, register moves, one flat
;    cmpswap, one wait and the final flat store; no cache invalidate or
;    write-back instruction appears in any of them;
;  - the two +tgsplit configurations wait on vmcnt(0) only, the gfx1200 ones
;    use s_wait_loadcnt_dscnt 0x0, and all others wait on vmcnt(0) lgkmcnt(0);
;  - gfx700 / gfx1010 materialise the +16 with scalar adds, the remaining
;    targets fold it into the instruction as offset 16.
;
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
8999define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
9000; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9001; GFX7:       ; %bb.0: ; %entry
9002; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9003; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9004; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9005; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9006; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9007; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9008; GFX7-NEXT:    s_mov_b32 s6, s4
9009; GFX7-NEXT:    s_mov_b32 s7, s5
9010; GFX7-NEXT:    s_mov_b32 s11, s12
9011; GFX7-NEXT:    s_mov_b32 s10, s13
9012; GFX7-NEXT:    s_add_u32 s6, s6, s11
9013; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9014; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9015; GFX7-NEXT:    s_mov_b32 s7, s10
9016; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9017; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9018; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9019; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9020; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9021; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9022; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9023; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9024; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9025; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9026; GFX7-NEXT:    flat_store_dword v[0:1], v2
9027; GFX7-NEXT:    s_endpgm
9028;
9029; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9030; GFX10-WGP:       ; %bb.0: ; %entry
9031; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9032; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9033; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9034; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9035; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9036; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9037; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9038; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9039; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9040; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9041; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9042; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9043; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9044; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9045; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9046; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9047; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9048; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9049; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9050; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9051; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9052; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9053; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9054; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9055; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9056; GFX10-WGP-NEXT:    s_endpgm
9057;
9058; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9059; GFX10-CU:       ; %bb.0: ; %entry
9060; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9061; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9062; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9063; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9064; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9065; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9066; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9067; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9068; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9069; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9070; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9071; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9072; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9073; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9074; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9075; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9076; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9077; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9079; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9080; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9081; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9082; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9083; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9084; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9085; GFX10-CU-NEXT:    s_endpgm
9086;
9087; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9088; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9089; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9090; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9091; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9092; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9093; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9094; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9095; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9096; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9097; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9098; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9099; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9100; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9101; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9102; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9103; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9104; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9105; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9106; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9107; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9108; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9109; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9111; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9112; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9113; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9114; SKIP-CACHE-INV-NEXT:    s_endpgm
9115;
9116; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9117; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9118; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9119; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9120; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9121; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9122; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9123; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9124; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9125; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9126; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9127; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9128; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9129; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9130; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9131; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9132;
9133; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9134; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9135; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9136; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9137; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9138; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9139; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9140; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9141; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9142; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9143; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9144; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9145; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9146; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9147; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9148; GFX90A-TGSPLIT-NEXT:    s_endpgm
9149;
9150; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9151; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9152; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9153; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9154; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9155; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9156; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9157; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9158; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9159; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9160; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9161; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9162; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9163; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9164; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9165; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9166;
9167; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9168; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9169; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9170; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9171; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9172; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9173; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9174; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9175; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9176; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9177; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9178; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9179; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9180; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9181; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9182; GFX940-TGSPLIT-NEXT:    s_endpgm
9183;
9184; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9185; GFX11-WGP:       ; %bb.0: ; %entry
9186; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9187; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9188; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9189; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9190; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9191; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9192; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9193; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9194; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9195; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9196; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9197; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9198; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9199; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9200; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9201; GFX11-WGP-NEXT:    s_endpgm
9202;
9203; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9204; GFX11-CU:       ; %bb.0: ; %entry
9205; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9206; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9207; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9208; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9209; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9210; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9211; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9212; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9213; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9214; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9215; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9216; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9217; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9218; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9219; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9220; GFX11-CU-NEXT:    s_endpgm
9221;
9222; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9223; GFX12-WGP:       ; %bb.0: ; %entry
9224; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9225; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9226; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9227; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9228; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9229; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9230; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9231; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9232; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9233; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9234; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9235; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9236; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9237; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9238; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9239; GFX12-WGP-NEXT:    s_endpgm
9240;
9241; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
9242; GFX12-CU:       ; %bb.0: ; %entry
9243; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9244; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9245; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9246; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9247; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9248; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9249; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9250; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9251; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9252; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9253; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9254; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9255; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9256; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9257; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9258; GFX12-CU-NEXT:    s_endpgm
9259    ptr %out, i32 %in, i32 %old) {
9260entry:
  ; Address of i32 element 4 past %out, i.e. %out + 16 bytes.
9261  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, write %in on a match; release on success, seq_cst on
  ; failure, wavefront scope.
9262  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
  ; Keep field 0 of the { i32, i1 } result (the i32); the i1 field is unused.
9263  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the extracted value keeps the cmpxchg result live.
9264  store i32 %val0, ptr %out, align 4
9265  ret void
9266}
9267
; Returning cmpxchg at syncscope("wavefront") with acq_rel success ordering and
; seq_cst failure ordering, issued through a plain `ptr` (the expected code uses
; flat_* instructions). The compare-and-swap targets i32 element 4 of %out
; (byte offset 16); the value it reads back is then stored to %out itself.
;
; What the expectations below pin down, per RUN configuration:
;  - the instruction stream is only kernarg loads, register moves, one flat
;    cmpswap, one wait and the final flat store; no cache invalidate or
;    write-back instruction appears in any of them;
;  - the two +tgsplit configurations wait on vmcnt(0) only, the gfx1200 ones
;    use s_wait_loadcnt_dscnt 0x0, and all others wait on vmcnt(0) lgkmcnt(0);
;  - gfx700 / gfx1010 materialise the +16 with scalar adds, the remaining
;    targets fold it into the instruction as offset 16.
;
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
9268define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
9269; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9270; GFX7:       ; %bb.0: ; %entry
9271; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9272; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9273; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9274; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9275; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9276; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9277; GFX7-NEXT:    s_mov_b32 s6, s4
9278; GFX7-NEXT:    s_mov_b32 s7, s5
9279; GFX7-NEXT:    s_mov_b32 s11, s12
9280; GFX7-NEXT:    s_mov_b32 s10, s13
9281; GFX7-NEXT:    s_add_u32 s6, s6, s11
9282; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9283; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9284; GFX7-NEXT:    s_mov_b32 s7, s10
9285; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9286; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9287; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9288; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9289; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9290; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9291; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9292; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9293; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9294; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9295; GFX7-NEXT:    flat_store_dword v[0:1], v2
9296; GFX7-NEXT:    s_endpgm
9297;
9298; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9299; GFX10-WGP:       ; %bb.0: ; %entry
9300; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9301; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9302; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9303; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9304; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9305; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9306; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9307; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9308; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9309; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9310; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9311; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9312; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9313; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9314; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9315; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9316; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9317; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9318; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9319; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9320; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9321; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9322; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9323; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9324; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9325; GFX10-WGP-NEXT:    s_endpgm
9326;
9327; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9328; GFX10-CU:       ; %bb.0: ; %entry
9329; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9330; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9331; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9332; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9333; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9334; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9335; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9336; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9337; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9338; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9339; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9340; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9341; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9342; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9343; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9344; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9345; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9346; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9347; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9348; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9349; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9350; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9351; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9352; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9353; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9354; GFX10-CU-NEXT:    s_endpgm
9355;
9356; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9357; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9358; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9359; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9360; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9361; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9362; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9363; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9364; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9365; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9366; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9368; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9369; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9370; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9371; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9372; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9373; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9374; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9375; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9377; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9378; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9379; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9380; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9381; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9382; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9383; SKIP-CACHE-INV-NEXT:    s_endpgm
9384;
9385; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9386; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9387; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9388; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9389; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9390; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9391; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9392; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9393; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9394; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9395; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9396; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9397; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9398; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9399; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9400; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9401;
9402; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9403; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9404; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9405; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9406; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9407; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9408; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9409; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9410; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9411; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9412; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9413; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9414; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9415; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9416; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9417; GFX90A-TGSPLIT-NEXT:    s_endpgm
9418;
9419; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9420; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9421; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9422; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9423; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9424; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9425; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9426; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9427; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9428; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9429; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9430; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9431; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9432; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9433; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9434; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9435;
9436; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9437; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9438; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9439; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9440; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9441; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9442; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9443; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9444; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9445; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9446; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9447; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9448; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9449; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9450; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9451; GFX940-TGSPLIT-NEXT:    s_endpgm
9452;
9453; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9454; GFX11-WGP:       ; %bb.0: ; %entry
9455; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9456; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9457; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9458; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9459; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9460; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9461; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9462; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9463; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9464; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9465; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9466; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9467; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9468; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9469; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9470; GFX11-WGP-NEXT:    s_endpgm
9471;
9472; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9473; GFX11-CU:       ; %bb.0: ; %entry
9474; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9475; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9476; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9477; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9478; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9479; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9480; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9481; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9482; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9483; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9484; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9485; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9486; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9487; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9488; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9489; GFX11-CU-NEXT:    s_endpgm
9490;
9491; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9492; GFX12-WGP:       ; %bb.0: ; %entry
9493; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9494; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9495; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9496; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9497; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9498; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9499; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9500; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9501; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9502; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9503; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9504; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9505; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9506; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9507; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9508; GFX12-WGP-NEXT:    s_endpgm
9509;
9510; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9511; GFX12-CU:       ; %bb.0: ; %entry
9512; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9513; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9514; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9515; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9516; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9517; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9518; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9519; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9520; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9521; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9522; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9523; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9524; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9525; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9526; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9527; GFX12-CU-NEXT:    s_endpgm
9528    ptr %out, i32 %in, i32 %old) {
9529entry:
  ; Address of i32 element 4 past %out, i.e. %out + 16 bytes.
9530  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, write %in on a match; acq_rel on success, seq_cst on
  ; failure, wavefront scope.
9531  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
  ; Keep field 0 of the { i32, i1 } result (the i32); the i1 field is unused.
9532  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the extracted value keeps the cmpxchg result live.
9533  store i32 %val0, ptr %out, align 4
9534  ret void
9535}
9536
9537define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
9538; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9539; GFX7:       ; %bb.0: ; %entry
9540; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9541; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9542; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9543; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9544; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9545; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9546; GFX7-NEXT:    s_mov_b32 s6, s4
9547; GFX7-NEXT:    s_mov_b32 s7, s5
9548; GFX7-NEXT:    s_mov_b32 s11, s12
9549; GFX7-NEXT:    s_mov_b32 s10, s13
9550; GFX7-NEXT:    s_add_u32 s6, s6, s11
9551; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9552; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9553; GFX7-NEXT:    s_mov_b32 s7, s10
9554; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9555; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9556; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9557; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9558; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9559; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9560; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9561; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9562; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9563; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9564; GFX7-NEXT:    flat_store_dword v[0:1], v2
9565; GFX7-NEXT:    s_endpgm
9566;
9567; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9568; GFX10-WGP:       ; %bb.0: ; %entry
9569; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
9570; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9571; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
9572; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
9573; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
9574; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9575; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
9576; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
9577; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
9578; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
9579; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
9580; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
9581; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9582; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
9583; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
9584; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
9585; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9586; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
9587; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9588; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9589; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9590; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9591; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9592; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9593; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9594; GFX10-WGP-NEXT:    s_endpgm
9595;
9596; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9597; GFX10-CU:       ; %bb.0: ; %entry
9598; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
9599; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9600; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
9601; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
9602; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
9603; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9604; GFX10-CU-NEXT:    s_mov_b32 s6, s4
9605; GFX10-CU-NEXT:    s_mov_b32 s7, s5
9606; GFX10-CU-NEXT:    s_mov_b32 s11, s12
9607; GFX10-CU-NEXT:    s_mov_b32 s10, s13
9608; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
9609; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
9610; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9611; GFX10-CU-NEXT:    s_mov_b32 s7, s10
9612; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
9613; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
9614; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9615; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
9616; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9617; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9618; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9619; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9620; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9621; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9622; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9623; GFX10-CU-NEXT:    s_endpgm
9624;
9625; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9626; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9627; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9628; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9629; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9630; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9631; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
9632; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9633; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
9634; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
9635; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9636; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9637; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
9638; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
9639; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9640; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9641; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
9642; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
9643; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9644; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
9645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9647; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9649; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9650; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9651; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9652; SKIP-CACHE-INV-NEXT:    s_endpgm
9653;
9654; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9655; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9656; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9657; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9658; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9659; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9660; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9661; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9662; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9663; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9664; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9665; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9666; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9667; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9668; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9669; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9670;
9671; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9672; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9673; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9674; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9675; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9676; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9677; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9678; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
9679; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9680; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9681; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9682; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9683; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9684; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9685; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9686; GFX90A-TGSPLIT-NEXT:    s_endpgm
9687;
9688; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9689; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9690; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9691; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9692; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9693; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9694; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9695; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9696; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9697; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9698; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9699; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9700; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9701; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9702; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9703; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9704;
9705; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9706; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9707; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9708; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9709; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9710; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9711; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9712; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
9713; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9714; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
9715; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9716; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9717; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9718; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9719; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9720; GFX940-TGSPLIT-NEXT:    s_endpgm
9721;
9722; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9723; GFX11-WGP:       ; %bb.0: ; %entry
9724; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9725; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9726; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9727; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9728; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
9729; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9730; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9731; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
9732; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9733; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9734; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9735; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9736; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9737; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9738; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9739; GFX11-WGP-NEXT:    s_endpgm
9740;
9741; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9742; GFX11-CU:       ; %bb.0: ; %entry
9743; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9744; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9745; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9746; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9747; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
9748; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9749; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9750; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
9751; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9752; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9753; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9754; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9755; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9756; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9757; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9758; GFX11-CU-NEXT:    s_endpgm
9759;
9760; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9761; GFX12-WGP:       ; %bb.0: ; %entry
9762; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9763; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9764; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9765; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9766; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
9767; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9768; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9769; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
9770; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9771; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9772; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9773; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9774; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9775; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9776; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9777; GFX12-WGP-NEXT:    s_endpgm
9778;
9779; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9780; GFX12-CU:       ; %bb.0: ; %entry
9781; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9782; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9783; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9784; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9785; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
9786; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9787; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9788; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
9789; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9790; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9791; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9792; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9793; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9794; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9795; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9796; GFX12-CU-NEXT:    s_endpgm
9797    ptr %out, i32 %in, i32 %old) {
9798entry:
9799  %gep = getelementptr i32, ptr %out, i32 4
9800  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
9801  %val0 = extractvalue { i32, i1 } %val, 0
9802  store i32 %val0, ptr %out, align 4
9803  ret void
9804}
9805
; Test kernel: atomic i32 load through a flat pointer with `unordered` ordering
; at the "wavefront-one-as" sync scope, followed by a plain store of the loaded
; value to %out.
;
; What the expected output below pins, for all twelve checked configurations:
;   * the load is an ordinary flat load with no cache-policy modifier
;     (no glc / sc0 / sc1 on the load itself);
;   * no cache-invalidate or write-back instruction appears in the kernel;
;   * exactly one wait sits between the load and the store that consumes v2
;     (presumably the data-dependency wait rather than a memory-model fence;
;     TODO confirm against the AMDGPU memory model table).
; On the gfx940 configurations the trailing plain store carries sc0 sc1.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
9806define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
9807; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
9808; GFX7:       ; %bb.0: ; %entry
9809; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9810; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9811; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9812; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9813; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9814; GFX7-NEXT:    flat_load_dword v2, v[0:1]
9815; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9816; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9817; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9818; GFX7-NEXT:    flat_store_dword v[0:1], v2
9819; GFX7-NEXT:    s_endpgm
9820;
9821; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
9822; GFX10-WGP:       ; %bb.0: ; %entry
9823; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9824; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9825; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9826; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9827; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9828; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
9829; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9830; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9831; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9832; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
9833; GFX10-WGP-NEXT:    s_endpgm
9834;
9835; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
9836; GFX10-CU:       ; %bb.0: ; %entry
9837; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9838; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9839; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9840; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
9841; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9842; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
9843; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
9844; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
9845; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9846; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
9847; GFX10-CU-NEXT:    s_endpgm
9848;
9849; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load:
9850; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9851; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9852; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
9853; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9854; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
9855; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
9856; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
9857; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
9858; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
9859; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9860; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
9861; SKIP-CACHE-INV-NEXT:    s_endpgm
9862;
9863; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
9864; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9865; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9866; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9867; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9868; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
9869; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9870; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9871; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9872; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9873; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9874;
9875; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
9876; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9877; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9878; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9879; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9880; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
9881; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9882; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9883; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9884; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
9885; GFX90A-TGSPLIT-NEXT:    s_endpgm
9886;
9887; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
9888; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9889; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9890; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9891; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9892; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9893; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9894; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9895; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9896; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9897; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9898;
9899; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
9900; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9901; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9902; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9903; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9904; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
9905; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
9906; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
9907; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9908; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
9909; GFX940-TGSPLIT-NEXT:    s_endpgm
9910;
9911; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_load:
9912; GFX11-WGP:       ; %bb.0: ; %entry
9913; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9914; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9915; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9916; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
9917; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
9918; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
9919; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
9920; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
9921; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9922; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
9923; GFX11-WGP-NEXT:    s_endpgm
9924;
9925; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_load:
9926; GFX11-CU:       ; %bb.0: ; %entry
9927; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9928; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9929; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9930; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
9931; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
9932; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
9933; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
9934; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
9935; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9936; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
9937; GFX11-CU-NEXT:    s_endpgm
9938;
9939; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load:
9940; GFX12-WGP:       ; %bb.0: ; %entry
9941; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9942; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9943; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9944; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
9945; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
9946; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
9947; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
9948; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
9949; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
9950; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
9951; GFX12-WGP-NEXT:    s_endpgm
9952;
9953; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load:
9954; GFX12-CU:       ; %bb.0: ; %entry
9955; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9956; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9957; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9958; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
9959; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
9960; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
9961; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
9962; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
9963; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
9964; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
9965; GFX12-CU-NEXT:    s_endpgm
9966    ptr %in, ptr %out) {
9967entry:
  ; Operation under test: 4-byte-aligned atomic i32 load, unordered, at the
  ; "wavefront-one-as" sync scope.
9968  %val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4
  ; Plain (non-atomic) store that consumes the loaded value.
9969  store i32 %val, ptr %out
9970  ret void
9971}
9972
; Test kernel: atomic i32 load through a flat pointer with `monotonic` ordering
; at the "wavefront-one-as" sync scope, followed by a plain store of the loaded
; value to %out.
;
; What the expected output below pins, for all twelve checked configurations:
;   * the load is an ordinary flat load with no cache-policy modifier
;     (no glc / sc0 / sc1 on the load itself);
;   * no cache-invalidate or write-back instruction appears in the kernel;
;   * exactly one wait sits between the load and the store that consumes v2
;     (presumably the data-dependency wait rather than a memory-model fence;
;     TODO confirm against the AMDGPU memory model table).
; On the gfx940 configurations the trailing plain store carries sc0 sc1.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
9973define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
9974; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
9975; GFX7:       ; %bb.0: ; %entry
9976; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9977; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9978; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9979; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9980; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9981; GFX7-NEXT:    flat_load_dword v2, v[0:1]
9982; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9983; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9984; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9985; GFX7-NEXT:    flat_store_dword v[0:1], v2
9986; GFX7-NEXT:    s_endpgm
9987;
9988; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
9989; GFX10-WGP:       ; %bb.0: ; %entry
9990; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9991; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9992; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9993; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
9994; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9995; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
9996; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
9997; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
9998; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9999; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10000; GFX10-WGP-NEXT:    s_endpgm
10001;
10002; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
10003; GFX10-CU:       ; %bb.0: ; %entry
10004; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10005; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10006; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10007; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10008; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10009; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
10010; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10011; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10012; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10013; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10014; GFX10-CU-NEXT:    s_endpgm
10015;
10016; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load:
10017; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10018; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10019; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
10020; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10021; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10022; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10023; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
10024; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10025; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10026; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10027; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10028; SKIP-CACHE-INV-NEXT:    s_endpgm
10029;
10030; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
10031; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10032; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10033; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10034; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10035; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10036; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10037; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10038; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10039; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10040; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10041;
10042; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
10043; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10044; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10045; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10046; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10047; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10048; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10049; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10050; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10051; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10052; GFX90A-TGSPLIT-NEXT:    s_endpgm
10053;
10054; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
10055; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10056; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10057; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10058; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10059; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10060; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10061; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10062; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10063; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10064; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10065;
10066; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
10067; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10068; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10069; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10070; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10071; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10072; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10073; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10074; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10075; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10076; GFX940-TGSPLIT-NEXT:    s_endpgm
10077;
10078; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
10079; GFX11-WGP:       ; %bb.0: ; %entry
10080; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10081; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10082; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10083; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10084; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10085; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
10086; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10087; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10088; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10089; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10090; GFX11-WGP-NEXT:    s_endpgm
10091;
10092; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_load:
10093; GFX11-CU:       ; %bb.0: ; %entry
10094; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10095; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10096; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10097; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10098; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10099; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
10100; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10101; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10102; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10103; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10104; GFX11-CU-NEXT:    s_endpgm
10105;
10106; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
10107; GFX12-WGP:       ; %bb.0: ; %entry
10108; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10109; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10110; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10111; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10112; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10113; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
10114; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10115; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10116; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10117; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10118; GFX12-WGP-NEXT:    s_endpgm
10119;
10120; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load:
10121; GFX12-CU:       ; %bb.0: ; %entry
10122; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10123; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10124; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10125; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10126; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10127; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
10128; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10129; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10130; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10131; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10132; GFX12-CU-NEXT:    s_endpgm
10133    ptr %in, ptr %out) {
10134entry:
  ; Operation under test: 4-byte-aligned atomic i32 load, monotonic, at the
  ; "wavefront-one-as" sync scope.
10135  %val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4
  ; Plain (non-atomic) store that consumes the loaded value.
10136  store i32 %val, ptr %out
10137  ret void
10138}
10139
10140define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
10141; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
10142; GFX7:       ; %bb.0: ; %entry
10143; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10144; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
10145; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10146; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10147; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10148; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10149; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10150; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10151; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10152; GFX7-NEXT:    flat_store_dword v[0:1], v2
10153; GFX7-NEXT:    s_endpgm
10154;
10155; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
10156; GFX10-WGP:       ; %bb.0: ; %entry
10157; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10158; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10159; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10160; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10161; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10162; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
10163; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10164; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10165; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10166; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10167; GFX10-WGP-NEXT:    s_endpgm
10168;
10169; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
10170; GFX10-CU:       ; %bb.0: ; %entry
10171; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10172; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10173; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10174; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10175; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10176; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
10177; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10178; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10179; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10180; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10181; GFX10-CU-NEXT:    s_endpgm
10182;
10183; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load:
10184; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10185; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10186; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
10187; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10188; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10189; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10190; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
10191; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10192; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10193; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10194; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10195; SKIP-CACHE-INV-NEXT:    s_endpgm
10196;
10197; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
10198; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10199; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10200; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10201; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10202; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10203; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10204; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10205; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10206; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10207; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10208;
10209; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
10210; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10211; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10212; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10213; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10214; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10215; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10216; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10217; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10218; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10219; GFX90A-TGSPLIT-NEXT:    s_endpgm
10220;
10221; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
10222; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10223; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10224; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10225; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10226; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10227; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10228; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10229; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10230; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10231; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10232;
10233; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
10234; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10235; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10236; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10237; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10238; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10239; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10240; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10241; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10242; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10243; GFX940-TGSPLIT-NEXT:    s_endpgm
10244;
10245; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_load:
10246; GFX11-WGP:       ; %bb.0: ; %entry
10247; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10248; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10249; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10250; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10251; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10252; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
10253; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10254; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10255; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10256; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10257; GFX11-WGP-NEXT:    s_endpgm
10258;
10259; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_load:
10260; GFX11-CU:       ; %bb.0: ; %entry
10261; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10262; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10263; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10264; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10265; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10266; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
10267; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10268; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10269; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10270; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10271; GFX11-CU-NEXT:    s_endpgm
10272;
10273; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load:
10274; GFX12-WGP:       ; %bb.0: ; %entry
10275; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10276; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10277; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10278; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10279; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10280; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
10281; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10282; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10283; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10284; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10285; GFX12-WGP-NEXT:    s_endpgm
10286;
10287; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load:
10288; GFX12-CU:       ; %bb.0: ; %entry
10289; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10290; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10291; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10292; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10293; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10294; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
10295; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10296; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10297; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10298; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10299; GFX12-CU-NEXT:    s_endpgm
10300    ptr %in, ptr %out) {
10301entry:
10302  %val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4
10303  store i32 %val, ptr %out
10304  ret void
10305}
10306
; Test kernel: performs a seq_cst atomic i32 load from the flat pointer %in at
; syncscope("wavefront-one-as") and writes the loaded value to %out with a
; plain (non-atomic) store.
;
; The per-target expectations below are autogenerated (see the note at the top
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand. For every run line the expected code is a flat load,
; one wait instruction, and a flat store; none of the expected sequences
; contains a cache-invalidate instruction.
10307define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
10308; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
10309; GFX7:       ; %bb.0: ; %entry
10310; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10311; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
10312; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10313; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10314; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10315; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10316; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10317; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10318; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10319; GFX7-NEXT:    flat_store_dword v[0:1], v2
10320; GFX7-NEXT:    s_endpgm
10321;
10322; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
10323; GFX10-WGP:       ; %bb.0: ; %entry
10324; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10325; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10326; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10327; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10328; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10329; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
10330; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
10331; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
10332; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10333; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10334; GFX10-WGP-NEXT:    s_endpgm
10335;
10336; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
10337; GFX10-CU:       ; %bb.0: ; %entry
10338; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10339; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10340; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10341; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10342; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10343; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
10344; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
10345; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
10346; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10347; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10348; GFX10-CU-NEXT:    s_endpgm
10349;
10350; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load:
10351; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10352; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10353; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
10354; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10355; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10357; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
10358; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
10359; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
10360; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10361; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10362; SKIP-CACHE-INV-NEXT:    s_endpgm
10363;
10364; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
10365; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10366; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10367; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10368; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10369; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10370; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10371; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10372; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10373; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10374; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10375;
10376; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
10377; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10378; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10379; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10380; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10381; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10382; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10383; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10384; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10385; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10386; GFX90A-TGSPLIT-NEXT:    s_endpgm
10387;
10388; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
10389; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10390; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10391; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10392; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10393; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10394; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10395; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10396; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10397; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10398; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10399;
10400; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
10401; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10402; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10403; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10404; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10405; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10406; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
10407; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
10408; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10409; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10410; GFX940-TGSPLIT-NEXT:    s_endpgm
10411;
10412; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
10413; GFX11-WGP:       ; %bb.0: ; %entry
10414; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10415; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10416; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10417; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10418; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10419; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
10420; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
10421; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
10422; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10423; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10424; GFX11-WGP-NEXT:    s_endpgm
10425;
10426; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
10427; GFX11-CU:       ; %bb.0: ; %entry
10428; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10429; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10430; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10431; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10432; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10433; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
10434; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
10435; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
10436; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10437; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10438; GFX11-CU-NEXT:    s_endpgm
10439;
10440; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
10441; GFX12-WGP:       ; %bb.0: ; %entry
10442; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10443; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10444; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10445; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10446; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10447; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
10448; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
10449; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
10450; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
10451; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10452; GFX12-WGP-NEXT:    s_endpgm
10453;
10454; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
10455; GFX12-CU:       ; %bb.0: ; %entry
10456; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10457; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10458; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10459; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10460; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10461; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
10462; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
10463; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
10464; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
10465; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10466; GFX12-CU-NEXT:    s_endpgm
10467    ptr %in, ptr %out) {
10468entry:
  ; Operation under test: seq_cst atomic load at wavefront-one-as scope.
10469  %val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4
  ; Plain store that consumes the loaded value.
10470  store i32 %val, ptr %out
10471  ret void
10472}
10473
; Test kernel: stores the i32 kernel argument %in to the flat pointer %out
; with an unordered atomic store at syncscope("wavefront-one-as").
;
; The per-target expectations below are autogenerated (see the note at the top
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand. For every run line the expected code loads the two
; kernel arguments, waits for them, moves them into vector registers and issues
; a single flat store; no further wait or cache instruction is expected before
; the store.
10474define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
10475; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
10476; GFX7:       ; %bb.0: ; %entry
10477; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10478; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10479; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10480; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10481; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10482; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10483; GFX7-NEXT:    flat_store_dword v[0:1], v2
10484; GFX7-NEXT:    s_endpgm
10485;
10486; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
10487; GFX10-WGP:       ; %bb.0: ; %entry
10488; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10489; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10490; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10491; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10492; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10493; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10494; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10495; GFX10-WGP-NEXT:    s_endpgm
10496;
10497; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
10498; GFX10-CU:       ; %bb.0: ; %entry
10499; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10500; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10501; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10502; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10503; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10504; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10505; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10506; GFX10-CU-NEXT:    s_endpgm
10507;
10508; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store:
10509; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10510; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10511; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10512; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10513; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10514; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10515; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10516; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10517; SKIP-CACHE-INV-NEXT:    s_endpgm
10518;
10519; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
10520; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10521; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10522; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10523; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10524; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10525; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10526; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10527; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10528;
10529; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
10530; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10531; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10532; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10533; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10534; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10535; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10536; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10537; GFX90A-TGSPLIT-NEXT:    s_endpgm
10538;
10539; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
10540; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10541; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10542; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10543; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10544; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10545; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10546; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10547; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10548;
10549; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
10550; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10551; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10552; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10553; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10554; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10555; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10556; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10557; GFX940-TGSPLIT-NEXT:    s_endpgm
10558;
10559; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_store:
10560; GFX11-WGP:       ; %bb.0: ; %entry
10561; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10562; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10563; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10564; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10565; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10566; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10567; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10568; GFX11-WGP-NEXT:    s_endpgm
10569;
10570; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_store:
10571; GFX11-CU:       ; %bb.0: ; %entry
10572; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10573; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10574; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10575; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10576; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10577; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10578; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10579; GFX11-CU-NEXT:    s_endpgm
10580;
10581; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_store:
10582; GFX12-WGP:       ; %bb.0: ; %entry
10583; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10584; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10585; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10586; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10587; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10588; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10589; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10590; GFX12-WGP-NEXT:    s_endpgm
10591;
10592; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_store:
10593; GFX12-CU:       ; %bb.0: ; %entry
10594; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10595; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10596; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10597; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10598; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10599; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
10600; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10601; GFX12-CU-NEXT:    s_endpgm
10602    i32 %in, ptr %out) {
10603entry:
  ; Operation under test: unordered atomic store at wavefront-one-as scope.
10604  store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
10605  ret void
10606}
10607
; Test kernel: stores the i32 kernel argument %in to the flat pointer %out
; with a monotonic atomic store at syncscope("wavefront-one-as").
;
; The per-target expectations below are autogenerated (see the note at the top
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand. For every run line the expected code loads the two
; kernel arguments, waits for them, moves them into vector registers and issues
; a single flat store; no further wait or cache instruction is expected before
; the store.
10608define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
10609; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
10610; GFX7:       ; %bb.0: ; %entry
10611; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10612; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10613; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10614; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10615; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10616; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10617; GFX7-NEXT:    flat_store_dword v[0:1], v2
10618; GFX7-NEXT:    s_endpgm
10619;
10620; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
10621; GFX10-WGP:       ; %bb.0: ; %entry
10622; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10623; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10624; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10625; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10626; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10627; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10628; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10629; GFX10-WGP-NEXT:    s_endpgm
10630;
10631; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
10632; GFX10-CU:       ; %bb.0: ; %entry
10633; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10634; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10635; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10636; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10637; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10638; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10639; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10640; GFX10-CU-NEXT:    s_endpgm
10641;
10642; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store:
10643; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10644; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10645; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10646; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10649; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10650; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10651; SKIP-CACHE-INV-NEXT:    s_endpgm
10652;
10653; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
10654; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10655; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10656; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10657; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10658; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10659; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10660; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10661; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10662;
10663; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
10664; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10665; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10666; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10667; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10668; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10669; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10670; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10671; GFX90A-TGSPLIT-NEXT:    s_endpgm
10672;
10673; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
10674; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10675; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10676; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10677; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10678; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10679; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10680; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10681; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10682;
10683; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
10684; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10685; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10686; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10687; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10688; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10689; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10690; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10691; GFX940-TGSPLIT-NEXT:    s_endpgm
10692;
10693; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
10694; GFX11-WGP:       ; %bb.0: ; %entry
10695; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10696; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10697; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10698; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10699; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10700; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10701; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10702; GFX11-WGP-NEXT:    s_endpgm
10703;
10704; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_store:
10705; GFX11-CU:       ; %bb.0: ; %entry
10706; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10707; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10708; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10709; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10710; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10711; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10712; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10713; GFX11-CU-NEXT:    s_endpgm
10714;
10715; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
10716; GFX12-WGP:       ; %bb.0: ; %entry
10717; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10718; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10719; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10720; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10721; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10722; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10723; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10724; GFX12-WGP-NEXT:    s_endpgm
10725;
10726; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_store:
10727; GFX12-CU:       ; %bb.0: ; %entry
10728; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10729; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10730; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10731; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10732; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10733; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
10734; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10735; GFX12-CU-NEXT:    s_endpgm
10736    i32 %in, ptr %out) {
10737entry:
  ; Operation under test: monotonic atomic store at wavefront-one-as scope.
10738  store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
10739  ret void
10740}
10741
; Test kernel: stores the i32 kernel argument %in to the flat pointer %out
; with a release atomic store at syncscope("wavefront-one-as").
;
; The per-target expectations below are autogenerated (see the note at the top
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand. For every run line the expected code loads the two
; kernel arguments, waits for them, moves them into vector registers and issues
; a single flat store; no further wait or cache instruction is expected
; immediately before the store.
10742define amdgpu_kernel void @flat_wavefront_one_as_release_store(
10743; GFX7-LABEL: flat_wavefront_one_as_release_store:
10744; GFX7:       ; %bb.0: ; %entry
10745; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10746; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10747; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10748; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10749; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10750; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10751; GFX7-NEXT:    flat_store_dword v[0:1], v2
10752; GFX7-NEXT:    s_endpgm
10753;
10754; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
10755; GFX10-WGP:       ; %bb.0: ; %entry
10756; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10757; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10758; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10759; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10760; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10761; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10762; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10763; GFX10-WGP-NEXT:    s_endpgm
10764;
10765; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
10766; GFX10-CU:       ; %bb.0: ; %entry
10767; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10768; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10769; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10770; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10771; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10772; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10773; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10774; GFX10-CU-NEXT:    s_endpgm
10775;
10776; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store:
10777; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10778; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10779; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10780; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10783; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10784; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10785; SKIP-CACHE-INV-NEXT:    s_endpgm
10786;
10787; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
10788; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10789; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10790; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10791; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10792; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10793; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10794; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10795; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10796;
10797; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
10798; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10799; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10800; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10801; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10802; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10803; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10804; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10805; GFX90A-TGSPLIT-NEXT:    s_endpgm
10806;
10807; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
10808; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10809; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10810; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10811; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10812; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10813; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10814; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10815; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10816;
10817; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
10818; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10819; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10820; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10821; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10822; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10823; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10824; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10825; GFX940-TGSPLIT-NEXT:    s_endpgm
10826;
10827; GFX11-WGP-LABEL: flat_wavefront_one_as_release_store:
10828; GFX11-WGP:       ; %bb.0: ; %entry
10829; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10830; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10831; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10832; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10833; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10834; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10835; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10836; GFX11-WGP-NEXT:    s_endpgm
10837;
10838; GFX11-CU-LABEL: flat_wavefront_one_as_release_store:
10839; GFX11-CU:       ; %bb.0: ; %entry
10840; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10841; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10842; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10843; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10844; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10845; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10846; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10847; GFX11-CU-NEXT:    s_endpgm
10848;
10849; GFX12-WGP-LABEL: flat_wavefront_one_as_release_store:
10850; GFX12-WGP:       ; %bb.0: ; %entry
10851; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10852; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10853; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10854; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10855; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10856; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10857; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10858; GFX12-WGP-NEXT:    s_endpgm
10859;
10860; GFX12-CU-LABEL: flat_wavefront_one_as_release_store:
10861; GFX12-CU:       ; %bb.0: ; %entry
10862; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10863; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10864; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10865; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
10866; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
10867; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
10868; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
10869; GFX12-CU-NEXT:    s_endpgm
10870    i32 %in, ptr %out) {
10871entry:
  ; Operation under test: release atomic store at wavefront-one-as scope.
10872  store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
10873  ret void
10874}
10875
; Test: seq_cst atomic i32 store through a default-address-space ptr at
; syncscope("wavefront-one-as"). For every checked llc configuration the
; expected code is the two scalar loads, one wait on them, the register moves
; and a single flat store followed directly by s_endpgm; no further wait or
; cache-maintenance instruction appears in the checked sequence.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; NOTE(review): the gfx940 checks carry "sc0 sc1" on the store, while the
; wavefront-one-as atomicrmw tests in this file carry no such bits -
; presumably a store-specific gfx940 subtarget behaviour rather than an effect
; of the sync scope; confirm before changing.
10876define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
10877; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
10878; GFX7:       ; %bb.0: ; %entry
10879; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10880; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10881; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10882; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10883; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10884; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10885; GFX7-NEXT:    flat_store_dword v[0:1], v2
10886; GFX7-NEXT:    s_endpgm
10887;
10888; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
10889; GFX10-WGP:       ; %bb.0: ; %entry
10890; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
10891; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10892; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10893; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
10894; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
10895; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
10896; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
10897; GFX10-WGP-NEXT:    s_endpgm
10898;
10899; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
10900; GFX10-CU:       ; %bb.0: ; %entry
10901; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
10902; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10903; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10904; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
10905; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
10906; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
10907; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
10908; GFX10-CU-NEXT:    s_endpgm
10909;
10910; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store:
10911; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10912; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
10913; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
10914; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
10916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
10917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
10918; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
10919; SKIP-CACHE-INV-NEXT:    s_endpgm
10920;
10921; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
10922; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10923; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10924; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10925; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10926; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10927; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10928; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10929; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10930;
10931; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
10932; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10933; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
10934; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
10935; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10936; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10937; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
10938; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
10939; GFX90A-TGSPLIT-NEXT:    s_endpgm
10940;
10941; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
10942; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10943; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10944; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10945; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10946; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10947; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10948; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10949; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10950;
10951; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
10952; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10953; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
10954; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
10955; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10956; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
10957; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
10958; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
10959; GFX940-TGSPLIT-NEXT:    s_endpgm
10960;
10961; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
10962; GFX11-WGP:       ; %bb.0: ; %entry
10963; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10964; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10965; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10966; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
10967; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
10968; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
10969; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
10970; GFX11-WGP-NEXT:    s_endpgm
10971;
10972; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
10973; GFX11-CU:       ; %bb.0: ; %entry
10974; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10975; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10976; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10977; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
10978; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
10979; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
10980; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
10981; GFX11-CU-NEXT:    s_endpgm
10982;
10983; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
10984; GFX12-WGP:       ; %bb.0: ; %entry
10985; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
10986; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10987; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10988; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
10989; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
10990; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
10991; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
10992; GFX12-WGP-NEXT:    s_endpgm
10993;
10994; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
10995; GFX12-CU:       ; %bb.0: ; %entry
10996; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
10997; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
10998; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10999; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11000; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11001; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11002; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11003; GFX12-CU-NEXT:    s_endpgm
11004    i32 %in, ptr %out) {
11005entry:
11006  store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
11007  ret void
11008}
11009
; Test: volatile atomicrmw xchg on a default-address-space ptr at
; syncscope("wavefront-one-as") with monotonic ordering. The result %val is
; never used. For every checked llc configuration the expected code is the
; two scalar loads, one wait on them, the register moves and a single flat
; atomic swap written without a destination register, followed directly by
; s_endpgm; no further wait or cache-maintenance instruction appears in the
; checked sequence.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with the update script instead of hand-editing.
11010define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
11011; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11012; GFX7:       ; %bb.0: ; %entry
11013; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11014; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11015; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11016; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11017; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11018; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11019; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11020; GFX7-NEXT:    s_endpgm
11021;
11022; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11023; GFX10-WGP:       ; %bb.0: ; %entry
11024; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11025; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11026; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11027; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11028; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11029; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11030; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11031; GFX10-WGP-NEXT:    s_endpgm
11032;
11033; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11034; GFX10-CU:       ; %bb.0: ; %entry
11035; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11036; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11037; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11038; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11039; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11040; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11041; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11042; GFX10-CU-NEXT:    s_endpgm
11043;
11044; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11045; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11046; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11047; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11048; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11050; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11051; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11052; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11053; SKIP-CACHE-INV-NEXT:    s_endpgm
11054;
11055; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11056; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11057; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11058; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11059; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11060; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11061; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11062; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11063; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11064;
11065; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11066; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11067; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11068; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11069; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11070; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11071; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11072; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11073; GFX90A-TGSPLIT-NEXT:    s_endpgm
11074;
11075; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11076; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11077; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11078; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11079; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11080; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11081; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11082; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11083; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11084;
11085; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11086; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11087; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11088; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11089; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11090; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11091; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11092; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11093; GFX940-TGSPLIT-NEXT:    s_endpgm
11094;
11095; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11096; GFX11-WGP:       ; %bb.0: ; %entry
11097; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11098; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11099; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11100; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11101; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11102; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11103; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11104; GFX11-WGP-NEXT:    s_endpgm
11105;
11106; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11107; GFX11-CU:       ; %bb.0: ; %entry
11108; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11109; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11110; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11111; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11112; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11113; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11114; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11115; GFX11-CU-NEXT:    s_endpgm
11116;
11117; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11118; GFX12-WGP:       ; %bb.0: ; %entry
11119; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11120; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11121; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11122; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11123; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11124; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11125; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11126; GFX12-WGP-NEXT:    s_endpgm
11127;
11128; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
11129; GFX12-CU:       ; %bb.0: ; %entry
11130; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11131; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11132; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11133; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11134; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11135; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11136; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11137; GFX12-CU-NEXT:    s_endpgm
11138    ptr %out, i32 %in) {
11139entry:
11140  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
11141  ret void
11142}
11143
; Test: volatile atomicrmw xchg on a default-address-space ptr at
; syncscope("wavefront-one-as") with acquire ordering. The result %val is
; never used. The expected instruction sequence is the same as for
; flat_wavefront_one_as_monotonic_atomicrmw: two scalar loads, one wait on
; them, the register moves and a single flat atomic swap written without a
; destination register, followed directly by s_endpgm. No cache-invalidate
; or extra wait instruction appears after the atomic in any checked
; configuration.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with the update script instead of hand-editing.
11144define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
11145; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11146; GFX7:       ; %bb.0: ; %entry
11147; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11148; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11149; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11150; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11151; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11152; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11153; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11154; GFX7-NEXT:    s_endpgm
11155;
11156; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11157; GFX10-WGP:       ; %bb.0: ; %entry
11158; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11159; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11160; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11161; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11162; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11163; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11164; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11165; GFX10-WGP-NEXT:    s_endpgm
11166;
11167; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11168; GFX10-CU:       ; %bb.0: ; %entry
11169; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11170; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11171; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11172; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11173; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11174; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11175; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11176; GFX10-CU-NEXT:    s_endpgm
11177;
11178; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11179; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11180; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11181; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11182; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11184; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11185; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11186; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11187; SKIP-CACHE-INV-NEXT:    s_endpgm
11188;
11189; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11190; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11191; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11192; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11193; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11194; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11195; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11196; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11197; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11198;
11199; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11200; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11201; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11202; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11203; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11204; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11205; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11206; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11207; GFX90A-TGSPLIT-NEXT:    s_endpgm
11208;
11209; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11210; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11211; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11212; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11213; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11214; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11215; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11216; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11217; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11218;
11219; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11220; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11221; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11222; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11223; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11224; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11225; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11226; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11227; GFX940-TGSPLIT-NEXT:    s_endpgm
11228;
11229; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11230; GFX11-WGP:       ; %bb.0: ; %entry
11231; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11232; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11233; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11234; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11235; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11236; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11237; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11238; GFX11-WGP-NEXT:    s_endpgm
11239;
11240; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11241; GFX11-CU:       ; %bb.0: ; %entry
11242; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11243; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11244; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11245; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11246; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11247; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11248; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11249; GFX11-CU-NEXT:    s_endpgm
11250;
11251; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11252; GFX12-WGP:       ; %bb.0: ; %entry
11253; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11254; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11255; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11256; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11257; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11258; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11259; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11260; GFX12-WGP-NEXT:    s_endpgm
11261;
11262; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
11263; GFX12-CU:       ; %bb.0: ; %entry
11264; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11265; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11266; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11267; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11268; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11269; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11270; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11271; GFX12-CU-NEXT:    s_endpgm
11272    ptr %out, i32 %in) {
11273entry:
11274  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
11275  ret void
11276}
11277
; Test: volatile atomicrmw xchg on a default-address-space ptr at
; syncscope("wavefront-one-as") with release ordering. The result %val is
; never used. The expected instruction sequence is the same as for
; flat_wavefront_one_as_monotonic_atomicrmw: two scalar loads, one wait on
; them, the register moves and a single flat atomic swap written without a
; destination register, followed directly by s_endpgm. No writeback or extra
; wait instruction appears before the atomic in any checked configuration.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with the update script instead of hand-editing.
11278define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
11279; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
11280; GFX7:       ; %bb.0: ; %entry
11281; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11282; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11283; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11284; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11285; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11286; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11287; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11288; GFX7-NEXT:    s_endpgm
11289;
11290; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
11291; GFX10-WGP:       ; %bb.0: ; %entry
11292; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11293; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11294; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11295; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11296; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11297; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11298; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11299; GFX10-WGP-NEXT:    s_endpgm
11300;
11301; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
11302; GFX10-CU:       ; %bb.0: ; %entry
11303; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11304; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11305; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11306; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11307; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11308; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11309; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11310; GFX10-CU-NEXT:    s_endpgm
11311;
11312; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw:
11313; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11314; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11315; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11316; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11317; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11318; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11319; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11320; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11321; SKIP-CACHE-INV-NEXT:    s_endpgm
11322;
11323; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
11324; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11325; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11326; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11327; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11328; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11329; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11330; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11331; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11332;
11333; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
11334; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11335; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11336; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11337; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11338; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11339; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11340; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11341; GFX90A-TGSPLIT-NEXT:    s_endpgm
11342;
11343; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
11344; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11345; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11346; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11347; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11348; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11349; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11350; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11351; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11352;
11353; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
11354; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11355; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11356; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11357; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11358; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11359; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11360; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11361; GFX940-TGSPLIT-NEXT:    s_endpgm
11362;
11363; GFX11-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
11364; GFX11-WGP:       ; %bb.0: ; %entry
11365; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11366; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11367; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11368; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11369; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11370; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11371; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11372; GFX11-WGP-NEXT:    s_endpgm
11373;
11374; GFX11-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
11375; GFX11-CU:       ; %bb.0: ; %entry
11376; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11377; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11378; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11379; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11380; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11381; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11382; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11383; GFX11-CU-NEXT:    s_endpgm
11384;
11385; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
11386; GFX12-WGP:       ; %bb.0: ; %entry
11387; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11388; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11389; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11390; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11391; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11392; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11393; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11394; GFX12-WGP-NEXT:    s_endpgm
11395;
11396; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
11397; GFX12-CU:       ; %bb.0: ; %entry
11398; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11399; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11400; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11401; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11402; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11403; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11404; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11405; GFX12-CU-NEXT:    s_endpgm
11406    ptr %out, i32 %in) {
11407entry:
11408  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
11409  ret void
11410}
11411
; Test: volatile atomicrmw xchg on a default-address-space ptr at
; syncscope("wavefront-one-as") with acq_rel ordering. The result %val is
; never used. The expected instruction sequence is the same as for
; flat_wavefront_one_as_monotonic_atomicrmw: two scalar loads, one wait on
; them, the register moves and a single flat atomic swap written without a
; destination register, followed directly by s_endpgm. No extra wait or
; cache-maintenance instruction appears on either side of the atomic in any
; checked configuration.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with the update script instead of hand-editing.
11412define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
11413; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11414; GFX7:       ; %bb.0: ; %entry
11415; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11416; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11417; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11418; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11419; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11420; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11421; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11422; GFX7-NEXT:    s_endpgm
11423;
11424; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11425; GFX10-WGP:       ; %bb.0: ; %entry
11426; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11427; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11428; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11429; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11430; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11431; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11432; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11433; GFX10-WGP-NEXT:    s_endpgm
11434;
11435; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11436; GFX10-CU:       ; %bb.0: ; %entry
11437; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11438; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11439; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11440; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11441; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11442; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11443; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11444; GFX10-CU-NEXT:    s_endpgm
11445;
11446; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11447; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11448; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11449; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11450; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11451; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11452; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11453; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11454; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11455; SKIP-CACHE-INV-NEXT:    s_endpgm
11456;
11457; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11458; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11459; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11460; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11461; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11462; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11463; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11464; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11465; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11466;
11467; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11468; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11469; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11470; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11471; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11472; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11473; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11474; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11475; GFX90A-TGSPLIT-NEXT:    s_endpgm
11476;
11477; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11478; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11479; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11480; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11481; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11482; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11483; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11484; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11485; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11486;
11487; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11488; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11489; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11490; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11491; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11492; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11493; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11494; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11495; GFX940-TGSPLIT-NEXT:    s_endpgm
11496;
11497; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11498; GFX11-WGP:       ; %bb.0: ; %entry
11499; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11500; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11501; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11502; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11503; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11504; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11505; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11506; GFX11-WGP-NEXT:    s_endpgm
11507;
11508; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11509; GFX11-CU:       ; %bb.0: ; %entry
11510; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11511; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11512; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11513; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11514; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11515; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11516; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11517; GFX11-CU-NEXT:    s_endpgm
11518;
11519; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11520; GFX12-WGP:       ; %bb.0: ; %entry
11521; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11522; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11523; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11524; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11525; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11526; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11527; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11528; GFX12-WGP-NEXT:    s_endpgm
11529;
11530; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
11531; GFX12-CU:       ; %bb.0: ; %entry
11532; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11533; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11534; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11535; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11536; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11537; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11538; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11539; GFX12-CU-NEXT:    s_endpgm
11540    ptr %out, i32 %in) {
11541entry:
11542  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
11543  ret void
11544}
11545
11546define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
11547; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11548; GFX7:       ; %bb.0: ; %entry
11549; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11550; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11551; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11552; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11553; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11554; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11555; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11556; GFX7-NEXT:    s_endpgm
11557;
11558; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11559; GFX10-WGP:       ; %bb.0: ; %entry
11560; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11561; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
11562; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11563; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
11564; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
11565; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
11566; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
11567; GFX10-WGP-NEXT:    s_endpgm
11568;
11569; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11570; GFX10-CU:       ; %bb.0: ; %entry
11571; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11572; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
11573; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11574; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
11575; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
11576; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
11577; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
11578; GFX10-CU-NEXT:    s_endpgm
11579;
11580; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11581; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11582; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11583; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
11584; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11585; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
11586; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
11587; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
11588; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
11589; SKIP-CACHE-INV-NEXT:    s_endpgm
11590;
11591; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11592; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11593; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11594; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11595; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11596; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11597; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11598; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11599; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11600;
11601; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11602; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11603; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11604; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
11605; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11606; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11607; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
11608; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11609; GFX90A-TGSPLIT-NEXT:    s_endpgm
11610;
11611; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11612; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11613; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11614; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11615; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11616; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11617; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11618; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11619; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11620;
11621; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11622; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11623; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
11624; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
11625; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11626; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
11627; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
11628; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
11629; GFX940-TGSPLIT-NEXT:    s_endpgm
11630;
11631; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11632; GFX11-WGP:       ; %bb.0: ; %entry
11633; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11634; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11635; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11636; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
11637; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
11638; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
11639; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11640; GFX11-WGP-NEXT:    s_endpgm
11641;
11642; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11643; GFX11-CU:       ; %bb.0: ; %entry
11644; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11645; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11646; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11647; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
11648; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
11649; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
11650; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11651; GFX11-CU-NEXT:    s_endpgm
11652;
11653; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11654; GFX12-WGP:       ; %bb.0: ; %entry
11655; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11656; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
11657; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11658; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
11659; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
11660; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
11661; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11662; GFX12-WGP-NEXT:    s_endpgm
11663;
11664; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
11665; GFX12-CU:       ; %bb.0: ; %entry
11666; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
11667; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
11668; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11669; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
11670; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
11671; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
11672; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
11673; GFX12-CU-NEXT:    s_endpgm
11674    ptr %out, i32 %in) {
11675entry:
11676  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
11677  ret void
11678}
11679
; Test: returning `atomicrmw xchg` with acquire ordering at the
; "wavefront-one-as" sync scope on a default-address-space pointer; the value
; returned by the exchange is then stored back through the same pointer.
; The autogenerated assertions inside the signature list, for each llc
; invocation, only the scalar argument loads, a returning flat atomic swap, a
; wait, and the flat store -- no cache-invalidate instruction appears in any of
; the expected sequences in this function.
11680define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
11681; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11682; GFX7:       ; %bb.0: ; %entry
11683; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11684; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
11685; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11686; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11687; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11688; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11689; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11690; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11691; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11692; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11693; GFX7-NEXT:    flat_store_dword v[0:1], v2
11694; GFX7-NEXT:    s_endpgm
11695;
11696; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11697; GFX10-WGP:       ; %bb.0: ; %entry
11698; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11699; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11700; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11701; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11702; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11703; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
11704; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11705; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11706; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11707; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11708; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11709; GFX10-WGP-NEXT:    s_endpgm
11710;
11711; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11712; GFX10-CU:       ; %bb.0: ; %entry
11713; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11714; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11715; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11716; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11717; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11718; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
11719; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11720; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11721; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11722; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11723; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11724; GFX10-CU-NEXT:    s_endpgm
11725;
11726; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11727; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11728; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11729; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
11730; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11731; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11732; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11733; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
11734; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11735; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11736; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11737; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11738; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11739; SKIP-CACHE-INV-NEXT:    s_endpgm
11740;
11741; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11742; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11743; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11744; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11745; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11746; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11747; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
11748; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11749; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11750; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11751; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11752; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11753;
11754; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11755; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11756; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11757; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11758; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11759; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11760; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
11761; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11762; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11763; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11764; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11765; GFX90A-TGSPLIT-NEXT:    s_endpgm
11766;
11767; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11768; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11769; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11770; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11771; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11772; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11773; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
11774; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
11775; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11776; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11777; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11778; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11779;
11780; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11781; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11782; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11783; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11784; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11785; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11786; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
11787; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
11788; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11789; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11790; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11791; GFX940-TGSPLIT-NEXT:    s_endpgm
11792;
11793; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11794; GFX11-WGP:       ; %bb.0: ; %entry
11795; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11796; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11797; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11798; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11799; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11800; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
11801; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
11802; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11803; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11804; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11805; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11806; GFX11-WGP-NEXT:    s_endpgm
11807;
11808; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11809; GFX11-CU:       ; %bb.0: ; %entry
11810; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11811; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11812; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11813; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11814; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11815; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
11816; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
11817; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11818; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11819; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11820; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
11821; GFX11-CU-NEXT:    s_endpgm
11822;
11823; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11824; GFX12-WGP:       ; %bb.0: ; %entry
11825; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11826; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11827; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11828; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11829; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11830; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
11831; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
11832; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
11833; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
11834; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
11835; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
11836; GFX12-WGP-NEXT:    s_endpgm
11837;
11838; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
11839; GFX12-CU:       ; %bb.0: ; %entry
11840; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11841; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11842; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11843; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11844; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11845; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
11846; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
11847; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
11848; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
11849; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
11850; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
11851; GFX12-CU-NEXT:    s_endpgm
11852    ptr %out, i32 %in) {
11853entry:
  ; Exchange %in into the location %out points at; %val is the previous value.
11854  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
  ; Plain store of the returned value, so the result of the swap is used.
11855  store i32 %val, ptr %out, align 4
11856  ret void
11857}
11858
; Test: returning `atomicrmw xchg` with acq_rel ordering at the
; "wavefront-one-as" sync scope on a default-address-space pointer; the value
; returned by the exchange is then stored back through the same pointer.
; The autogenerated assertions inside the signature list, for each llc
; invocation, only the scalar argument loads, a returning flat atomic swap, a
; wait, and the flat store -- no cache-invalidate or write-back instruction
; appears in any of the expected sequences in this function.
11859define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
11860; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11861; GFX7:       ; %bb.0: ; %entry
11862; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11863; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
11864; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11865; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11866; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11867; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11868; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11869; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11870; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11871; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11872; GFX7-NEXT:    flat_store_dword v[0:1], v2
11873; GFX7-NEXT:    s_endpgm
11874;
11875; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11876; GFX10-WGP:       ; %bb.0: ; %entry
11877; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11878; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11879; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11880; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11881; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11882; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
11883; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11884; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
11885; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
11886; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11887; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
11888; GFX10-WGP-NEXT:    s_endpgm
11889;
11890; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11891; GFX10-CU:       ; %bb.0: ; %entry
11892; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11893; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11894; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11895; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11896; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11897; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
11898; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11899; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
11900; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
11901; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11902; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
11903; GFX10-CU-NEXT:    s_endpgm
11904;
11905; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11906; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11907; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11908; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
11909; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11910; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11911; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
11913; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11914; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
11915; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
11916; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11917; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
11918; SKIP-CACHE-INV-NEXT:    s_endpgm
11919;
11920; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11921; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11922; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11923; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11924; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11925; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11926; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
11927; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11928; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11929; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11930; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11931; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11932;
11933; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11934; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11935; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11936; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11937; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11938; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11939; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
11940; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11941; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11942; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11943; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
11944; GFX90A-TGSPLIT-NEXT:    s_endpgm
11945;
11946; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11947; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11948; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11949; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11950; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11951; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11952; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
11953; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
11954; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11955; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11956; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11957; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11958;
11959; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11960; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11961; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11962; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11963; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11964; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11965; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
11966; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
11967; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
11968; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11969; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
11970; GFX940-TGSPLIT-NEXT:    s_endpgm
11971;
11972; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11973; GFX11-WGP:       ; %bb.0: ; %entry
11974; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11975; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11976; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11977; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11978; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11979; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
11980; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
11981; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
11982; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
11983; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11984; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
11985; GFX11-WGP-NEXT:    s_endpgm
11986;
11987; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
11988; GFX11-CU:       ; %bb.0: ; %entry
11989; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11990; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11991; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11992; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11993; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11994; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
11995; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
11996; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
11997; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
11998; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11999; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12000; GFX11-CU-NEXT:    s_endpgm
12001;
12002; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
12003; GFX12-WGP:       ; %bb.0: ; %entry
12004; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12005; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12006; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12007; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12008; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12009; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
12010; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12011; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12012; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12013; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
12014; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12015; GFX12-WGP-NEXT:    s_endpgm
12016;
12017; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
12018; GFX12-CU:       ; %bb.0: ; %entry
12019; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12020; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12021; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12022; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12023; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12024; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
12025; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12026; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12027; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12028; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
12029; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12030; GFX12-CU-NEXT:    s_endpgm
12031    ptr %out, i32 %in) {
12032entry:
  ; Exchange %in into the location %out points at; %val is the previous value.
12033  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
  ; Plain store of the returned value, so the result of the swap is used.
12034  store i32 %val, ptr %out, align 4
12035  ret void
12036}
12037
; Test: returning `atomicrmw xchg` with seq_cst ordering at the
; "wavefront-one-as" sync scope on a default-address-space pointer; the value
; returned by the exchange is then stored back through the same pointer.
; The autogenerated assertions inside the signature list, for each llc
; invocation, only the scalar argument loads, a returning flat atomic swap, a
; wait, and the flat store -- no cache-invalidate or write-back instruction
; appears in any of the expected sequences in this function.
12038define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
12039; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12040; GFX7:       ; %bb.0: ; %entry
12041; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12042; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
12043; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12044; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12045; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12046; GFX7-NEXT:    v_mov_b32_e32 v2, s6
12047; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12048; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12049; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12050; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12051; GFX7-NEXT:    flat_store_dword v[0:1], v2
12052; GFX7-NEXT:    s_endpgm
12053;
12054; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12055; GFX10-WGP:       ; %bb.0: ; %entry
12056; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12057; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
12058; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12059; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12060; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12061; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
12062; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12063; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12064; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12065; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12066; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
12067; GFX10-WGP-NEXT:    s_endpgm
12068;
12069; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12070; GFX10-CU:       ; %bb.0: ; %entry
12071; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12072; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
12073; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12074; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12075; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12076; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
12077; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12078; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12079; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12080; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12081; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
12082; GFX10-CU-NEXT:    s_endpgm
12083;
12084; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12085; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12086; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12087; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
12088; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12091; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
12092; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12093; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12095; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12096; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
12097; SKIP-CACHE-INV-NEXT:    s_endpgm
12098;
12099; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12100; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12101; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12102; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12103; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12104; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12105; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
12106; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12107; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12109; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12110; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12111;
12112; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12113; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12114; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12115; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12116; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12117; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12118; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
12119; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12120; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12121; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12122; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
12123; GFX90A-TGSPLIT-NEXT:    s_endpgm
12124;
12125; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12126; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12127; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12128; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12129; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12130; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12131; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
12132; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
12133; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12134; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12135; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12136; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12137;
12138; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12139; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12140; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12141; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12142; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12143; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12144; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
12145; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
12146; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12147; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12148; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
12149; GFX940-TGSPLIT-NEXT:    s_endpgm
12150;
12151; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12152; GFX11-WGP:       ; %bb.0: ; %entry
12153; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12154; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12155; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12156; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12157; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12158; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
12159; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
12160; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12161; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12162; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12163; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
12164; GFX11-WGP-NEXT:    s_endpgm
12165;
12166; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12167; GFX11-CU:       ; %bb.0: ; %entry
12168; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12169; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12170; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12171; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12172; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12173; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
12174; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
12175; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12176; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12177; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12178; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
12179; GFX11-CU-NEXT:    s_endpgm
12180;
12181; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12182; GFX12-WGP:       ; %bb.0: ; %entry
12183; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12184; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12185; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12186; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12187; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12188; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
12189; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12190; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12191; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12192; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
12193; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
12194; GFX12-WGP-NEXT:    s_endpgm
12195;
12196; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
12197; GFX12-CU:       ; %bb.0: ; %entry
12198; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12199; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12200; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12201; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12202; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12203; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
12204; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12205; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12206; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12207; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
12208; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
12209; GFX12-CU-NEXT:    s_endpgm
12210    ptr %out, i32 %in) {
12211entry:
  ; Exchange %in into the location %out points at; %val is the previous value.
12212  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
  ; Plain store of the returned value, so the result of the swap is used.
12213  store i32 %val, ptr %out, align 4
12214  ret void
12215}
12216
; Volatile cmpxchg through a plain `ptr` argument with
; syncscope("wavefront-one-as") and monotonic success / monotonic failure
; ordering. The cmpxchg result is not used.
; The accessed address is %out plus four i32 elements (16 bytes). The gfx700
; and gfx1010 expectations add that 16 with s_add_u32/s_addc_u32 and put no
; offset on the atomic, while the gfx90a, gfx940, gfx1100 and gfx1200
; expectations encode it as an instruction offset of 16.
; Every expected sequence contains exactly one flat_atomic_cmpswap (or
; flat_atomic_cmpswap_b32) immediately followed by s_endpgm; the only wait
; instruction in each sequence comes right after the scalar loads.
12217define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
12218; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12219; GFX7:       ; %bb.0: ; %entry
12220; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12221; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12222; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12223; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12224; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12225; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12226; GFX7-NEXT:    s_mov_b32 s4, s8
12227; GFX7-NEXT:    s_mov_b32 s5, s9
12228; GFX7-NEXT:    s_mov_b32 s9, s10
12229; GFX7-NEXT:    s_mov_b32 s8, s11
12230; GFX7-NEXT:    s_add_u32 s4, s4, s9
12231; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12232; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12233; GFX7-NEXT:    s_mov_b32 s5, s8
12234; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12235; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12236; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12237; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12238; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12239; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12240; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12241; GFX7-NEXT:    s_endpgm
12242;
12243; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12244; GFX10-WGP:       ; %bb.0: ; %entry
12245; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12246; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12247; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12248; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12249; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12250; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12251; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12252; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12253; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12254; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12255; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12256; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12257; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12258; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12259; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12260; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12261; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12262; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12263; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12264; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12265; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12266; GFX10-WGP-NEXT:    s_endpgm
12267;
12268; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12269; GFX10-CU:       ; %bb.0: ; %entry
12270; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12271; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12272; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12273; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12274; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12275; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12276; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12277; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12278; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12279; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12280; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12281; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12282; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12283; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12284; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12285; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12286; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12287; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12288; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12289; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12290; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12291; GFX10-CU-NEXT:    s_endpgm
12292;
12293; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12294; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12295; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12296; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12297; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12298; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12299; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12300; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12301; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12302; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12303; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12304; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12305; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12306; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12307; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12308; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12309; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12310; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12311; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12313; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12314; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12315; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12316; SKIP-CACHE-INV-NEXT:    s_endpgm
12317;
12318; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12319; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12320; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12321; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12322; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12323; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12324; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12325; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12326; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12327; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12328; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12329; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12330; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12331;
12332; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12333; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12334; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12335; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12336; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12337; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12338; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12339; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12340; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12341; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12342; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12343; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12344; GFX90A-TGSPLIT-NEXT:    s_endpgm
12345;
12346; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12347; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12348; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12349; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12350; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12351; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12352; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12353; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12354; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12355; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12356; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12357; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12358; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12359;
12360; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12361; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12362; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12363; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12364; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12365; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12366; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12367; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12368; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12369; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12370; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12371; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12372; GFX940-TGSPLIT-NEXT:    s_endpgm
12373;
12374; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12375; GFX11-WGP:       ; %bb.0: ; %entry
12376; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12377; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12378; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12379; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12380; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
12381; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12382; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12383; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
12384; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12385; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12386; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12387; GFX11-WGP-NEXT:    s_endpgm
12388;
12389; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12390; GFX11-CU:       ; %bb.0: ; %entry
12391; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12392; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12393; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12394; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12395; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
12396; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12397; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12398; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
12399; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12400; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12401; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12402; GFX11-CU-NEXT:    s_endpgm
12403;
12404; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12405; GFX12-WGP:       ; %bb.0: ; %entry
12406; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12407; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12408; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12409; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12410; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
12411; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12412; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12413; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
12414; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12415; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12416; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12417; GFX12-WGP-NEXT:    s_endpgm
12418;
12419; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
12420; GFX12-CU:       ; %bb.0: ; %entry
12421; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12422; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12423; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12424; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12425; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
12426; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12427; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12428; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
12429; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12430; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12431; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12432; GFX12-CU-NEXT:    s_endpgm
12433    ptr %out, i32 %in, i32 %old) {
12434entry:
  ; Index 4 of i32 elements, i.e. 16 bytes past %out.
12435  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
12436  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
12437  ret void
12438}
12439
; Volatile cmpxchg through a plain `ptr` argument with
; syncscope("wavefront-one-as") and acquire success / monotonic failure
; ordering. The cmpxchg result is not used.
; The accessed address is %out plus four i32 elements (16 bytes). The gfx700
; and gfx1010 expectations add that 16 with s_add_u32/s_addc_u32 and put no
; offset on the atomic, while the gfx90a, gfx940, gfx1100 and gfx1200
; expectations encode it as an instruction offset of 16.
; Every expected sequence contains exactly one flat_atomic_cmpswap (or
; flat_atomic_cmpswap_b32) immediately followed by s_endpgm, so nothing is
; expected after the atomic for the acquire; the only wait instruction in each
; sequence comes right after the scalar loads.
12440define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
12441; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12442; GFX7:       ; %bb.0: ; %entry
12443; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12444; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12445; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12446; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12447; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12448; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12449; GFX7-NEXT:    s_mov_b32 s4, s8
12450; GFX7-NEXT:    s_mov_b32 s5, s9
12451; GFX7-NEXT:    s_mov_b32 s9, s10
12452; GFX7-NEXT:    s_mov_b32 s8, s11
12453; GFX7-NEXT:    s_add_u32 s4, s4, s9
12454; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12455; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12456; GFX7-NEXT:    s_mov_b32 s5, s8
12457; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12458; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12459; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12460; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12461; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12462; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12463; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12464; GFX7-NEXT:    s_endpgm
12465;
12466; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12467; GFX10-WGP:       ; %bb.0: ; %entry
12468; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12469; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12470; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12471; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12472; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12473; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12474; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12475; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12476; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12477; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12478; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12479; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12480; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12481; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12482; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12483; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12484; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12485; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12486; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12487; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12488; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12489; GFX10-WGP-NEXT:    s_endpgm
12490;
12491; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12492; GFX10-CU:       ; %bb.0: ; %entry
12493; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12494; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12495; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12496; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12497; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12498; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12499; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12500; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12501; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12502; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12503; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12504; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12505; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12506; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12507; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12508; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12509; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12510; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12511; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12512; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12513; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12514; GFX10-CU-NEXT:    s_endpgm
12515;
12516; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12517; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12518; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12519; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12520; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12521; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12522; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12523; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12524; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12525; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12526; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12527; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12528; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12529; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12530; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12531; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12532; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12533; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12534; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12536; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12537; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12538; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12539; SKIP-CACHE-INV-NEXT:    s_endpgm
12540;
12541; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12542; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12543; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12544; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12545; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12546; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12547; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12548; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12549; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12550; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12551; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12552; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12553; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12554;
12555; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12556; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12557; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12558; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12559; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12560; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12561; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12562; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12563; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12564; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12565; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12566; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12567; GFX90A-TGSPLIT-NEXT:    s_endpgm
12568;
12569; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12570; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12571; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12572; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12573; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12574; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12575; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12576; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12577; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12578; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12579; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12580; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12581; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12582;
12583; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12584; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12585; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12586; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12587; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12588; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12589; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12590; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12591; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12592; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12593; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12594; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12595; GFX940-TGSPLIT-NEXT:    s_endpgm
12596;
12597; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12598; GFX11-WGP:       ; %bb.0: ; %entry
12599; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12600; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12601; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12602; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12603; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
12604; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12605; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12606; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
12607; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12608; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12609; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12610; GFX11-WGP-NEXT:    s_endpgm
12611;
12612; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12613; GFX11-CU:       ; %bb.0: ; %entry
12614; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12615; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12616; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12617; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12618; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
12619; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12620; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12621; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
12622; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12623; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12624; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12625; GFX11-CU-NEXT:    s_endpgm
12626;
12627; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12628; GFX12-WGP:       ; %bb.0: ; %entry
12629; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12630; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12631; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12632; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12633; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
12634; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12635; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12636; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
12637; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12638; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12639; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12640; GFX12-WGP-NEXT:    s_endpgm
12641;
12642; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
12643; GFX12-CU:       ; %bb.0: ; %entry
12644; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12645; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12646; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12647; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12648; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
12649; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12650; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12651; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
12652; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12653; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12654; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12655; GFX12-CU-NEXT:    s_endpgm
12656    ptr %out, i32 %in, i32 %old) {
12657entry:
  ; Index 4 of i32 elements, i.e. 16 bytes past %out.
12658  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
12659  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
12660  ret void
12661}
12662
; Volatile cmpxchg through a plain `ptr` argument with
; syncscope("wavefront-one-as") and release success / monotonic failure
; ordering. The cmpxchg result is not used.
; The accessed address is %out plus four i32 elements (16 bytes). The gfx700
; and gfx1010 expectations add that 16 with s_add_u32/s_addc_u32 and put no
; offset on the atomic, while the gfx90a, gfx940, gfx1100 and gfx1200
; expectations encode it as an instruction offset of 16.
; Every expected sequence contains exactly one flat_atomic_cmpswap (or
; flat_atomic_cmpswap_b32) immediately followed by s_endpgm; the only wait
; instruction in each sequence comes right after the scalar loads, so no
; separate wait is expected directly before the atomic for the release.
12663define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
12664; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12665; GFX7:       ; %bb.0: ; %entry
12666; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12667; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12668; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12669; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12670; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12671; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12672; GFX7-NEXT:    s_mov_b32 s4, s8
12673; GFX7-NEXT:    s_mov_b32 s5, s9
12674; GFX7-NEXT:    s_mov_b32 s9, s10
12675; GFX7-NEXT:    s_mov_b32 s8, s11
12676; GFX7-NEXT:    s_add_u32 s4, s4, s9
12677; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12678; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12679; GFX7-NEXT:    s_mov_b32 s5, s8
12680; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12681; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12682; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12683; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12684; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12685; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12686; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12687; GFX7-NEXT:    s_endpgm
12688;
12689; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12690; GFX10-WGP:       ; %bb.0: ; %entry
12691; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12692; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12693; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12694; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12695; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12696; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12697; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12698; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12699; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12700; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12701; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12702; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12703; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12704; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12705; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12706; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12707; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12708; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12709; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12710; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12711; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12712; GFX10-WGP-NEXT:    s_endpgm
12713;
12714; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12715; GFX10-CU:       ; %bb.0: ; %entry
12716; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12717; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12718; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12719; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12720; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12721; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12722; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12723; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12724; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12725; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12726; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12727; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12728; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12729; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12730; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12731; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12732; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12733; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12734; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12735; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12736; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12737; GFX10-CU-NEXT:    s_endpgm
12738;
12739; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12740; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12741; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12742; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12743; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12744; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12745; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12746; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12747; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12748; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12749; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12750; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12751; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12752; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12753; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12754; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12756; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12757; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12758; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12759; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12760; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12761; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12762; SKIP-CACHE-INV-NEXT:    s_endpgm
12763;
12764; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12765; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12766; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12767; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12768; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12769; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12770; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12771; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12772; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12773; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12774; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12775; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12776; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12777;
12778; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12779; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12780; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12781; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12782; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12783; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12784; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12785; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12786; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12787; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12788; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12789; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12790; GFX90A-TGSPLIT-NEXT:    s_endpgm
12791;
12792; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12793; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12794; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12795; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12796; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12797; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12798; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12799; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12800; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12801; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12802; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12803; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12804; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12805;
12806; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12807; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12808; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12809; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12810; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12811; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12812; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12813; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
12814; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12815; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12816; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
12817; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12818; GFX940-TGSPLIT-NEXT:    s_endpgm
12819;
12820; GFX11-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12821; GFX11-WGP:       ; %bb.0: ; %entry
12822; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12823; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12824; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12825; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12826; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
12827; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
12828; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12829; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
12830; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
12831; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
12832; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12833; GFX11-WGP-NEXT:    s_endpgm
12834;
12835; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12836; GFX11-CU:       ; %bb.0: ; %entry
12837; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12838; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12839; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12840; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12841; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
12842; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
12843; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12844; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
12845; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
12846; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
12847; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12848; GFX11-CU-NEXT:    s_endpgm
12849;
12850; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12851; GFX12-WGP:       ; %bb.0: ; %entry
12852; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12853; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12854; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12855; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12856; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
12857; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
12858; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12859; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
12860; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
12861; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
12862; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12863; GFX12-WGP-NEXT:    s_endpgm
12864;
12865; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
12866; GFX12-CU:       ; %bb.0: ; %entry
12867; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12868; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12869; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12870; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12871; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
12872; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
12873; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12874; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
12875; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
12876; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
12877; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12878; GFX12-CU-NEXT:    s_endpgm
12879    ptr %out, i32 %in, i32 %old) {
12880entry:
  ; Index 4 of i32 elements, i.e. 16 bytes past %out.
12881  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old, store %in on match; %val is never read afterwards.
12882  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
12883  ret void
12884}
12885
; Test case: 32-bit cmpxchg through a flat pointer with success ordering
; acq_rel and failure ordering monotonic at syncscope "wavefront-one-as".
; The kernel receives (ptr %out, i32 %in, i32 %old) and issues a volatile
; cmpxchg on i32 element 4 of %out, comparing against %old and storing %in.
; Element 4 is the 16-byte displacement that appears in the expected code
; either as "offset:16" on the atomic or as a scalar add of the constant 16.
; The cmpxchg result (%val) is not used.
; In every checked configuration the expected code contains exactly one wait
; instruction, placed after the kernel-argument loads, followed by register
; setup, one flat_atomic_cmpswap and s_endpgm. No further wait or
; cache-maintenance instruction is expected around the atomic.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing by hand.
12886define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
12887; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12888; GFX7:       ; %bb.0: ; %entry
12889; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12890; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12891; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12892; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12893; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12894; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12895; GFX7-NEXT:    s_mov_b32 s4, s8
12896; GFX7-NEXT:    s_mov_b32 s5, s9
12897; GFX7-NEXT:    s_mov_b32 s9, s10
12898; GFX7-NEXT:    s_mov_b32 s8, s11
12899; GFX7-NEXT:    s_add_u32 s4, s4, s9
12900; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12901; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12902; GFX7-NEXT:    s_mov_b32 s5, s8
12903; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12904; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12905; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12906; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12907; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12908; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12909; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12910; GFX7-NEXT:    s_endpgm
12911;
12912; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12913; GFX10-WGP:       ; %bb.0: ; %entry
12914; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
12915; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12916; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
12917; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
12918; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
12919; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12920; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
12921; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
12922; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
12923; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
12924; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
12925; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
12926; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12927; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
12928; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
12929; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
12930; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12931; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
12932; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
12933; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
12934; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12935; GFX10-WGP-NEXT:    s_endpgm
12936;
12937; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12938; GFX10-CU:       ; %bb.0: ; %entry
12939; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
12940; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12941; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
12942; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
12943; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
12944; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12945; GFX10-CU-NEXT:    s_mov_b32 s4, s8
12946; GFX10-CU-NEXT:    s_mov_b32 s5, s9
12947; GFX10-CU-NEXT:    s_mov_b32 s9, s10
12948; GFX10-CU-NEXT:    s_mov_b32 s8, s11
12949; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
12950; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
12951; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12952; GFX10-CU-NEXT:    s_mov_b32 s5, s8
12953; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
12954; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
12955; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12956; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
12957; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
12958; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
12959; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12960; GFX10-CU-NEXT:    s_endpgm
12961;
12962; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12963; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12964; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
12965; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
12966; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
12967; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
12968; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
12969; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12970; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
12971; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
12972; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
12973; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
12974; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
12975; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
12976; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12977; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
12978; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
12979; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
12980; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12981; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
12982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
12983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
12984; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12985; SKIP-CACHE-INV-NEXT:    s_endpgm
12986;
12987; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12988; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12989; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12990; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12991; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12992; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12993; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12994; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
12995; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12996; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
12997; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12998; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12999; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13000;
13001; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13002; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13003; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13004; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13005; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13006; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13007; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13008; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13009; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13010; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13011; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13012; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13013; GFX90A-TGSPLIT-NEXT:    s_endpgm
13014;
13015; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13016; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13017; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13018; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13019; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13020; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13021; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13022; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13023; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13024; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13025; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13026; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13027; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13028;
13029; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13030; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13031; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13032; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13033; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13034; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13035; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13036; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13037; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13038; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13039; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13040; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13041; GFX940-TGSPLIT-NEXT:    s_endpgm
13042;
13043; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13044; GFX11-WGP:       ; %bb.0: ; %entry
13045; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13046; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13047; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13048; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13049; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13050; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13051; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13052; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13053; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13054; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13055; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13056; GFX11-WGP-NEXT:    s_endpgm
13057;
13058; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13059; GFX11-CU:       ; %bb.0: ; %entry
13060; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13061; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13062; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13063; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13064; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13065; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13066; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13067; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13068; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13069; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13070; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13071; GFX11-CU-NEXT:    s_endpgm
13072;
13073; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13074; GFX12-WGP:       ; %bb.0: ; %entry
13075; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13076; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13077; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13078; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13079; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13080; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13081; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13082; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13083; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13084; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13085; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13086; GFX12-WGP-NEXT:    s_endpgm
13087;
13088; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13089; GFX12-CU:       ; %bb.0: ; %entry
13090; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13091; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13092; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13093; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13094; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13095; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13096; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13097; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13098; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13099; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13100; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13101; GFX12-CU-NEXT:    s_endpgm
13102    ptr %out, i32 %in, i32 %old) {
13103entry:
13104  %gep = getelementptr i32, ptr %out, i32 4
13105  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
13106  ret void
13107}
13108
; Test case: 32-bit cmpxchg through a flat pointer with success ordering
; seq_cst and failure ordering monotonic at syncscope "wavefront-one-as".
; The kernel receives (ptr %out, i32 %in, i32 %old) and issues a volatile
; cmpxchg on i32 element 4 of %out, comparing against %old and storing %in.
; Element 4 is the 16-byte displacement that appears in the expected code
; either as "offset:16" on the atomic or as a scalar add of the constant 16.
; The cmpxchg result (%val) is not used.
; In every checked configuration the expected code contains exactly one wait
; instruction, placed after the kernel-argument loads, followed by register
; setup, one flat_atomic_cmpswap and s_endpgm. No further wait or
; cache-maintenance instruction is expected around the atomic.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing by hand.
13109define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
13110; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13111; GFX7:       ; %bb.0: ; %entry
13112; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13113; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13114; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13115; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13116; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13117; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13118; GFX7-NEXT:    s_mov_b32 s4, s8
13119; GFX7-NEXT:    s_mov_b32 s5, s9
13120; GFX7-NEXT:    s_mov_b32 s9, s10
13121; GFX7-NEXT:    s_mov_b32 s8, s11
13122; GFX7-NEXT:    s_add_u32 s4, s4, s9
13123; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13124; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13125; GFX7-NEXT:    s_mov_b32 s5, s8
13126; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13127; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13128; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13129; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13130; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13131; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13132; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13133; GFX7-NEXT:    s_endpgm
13134;
13135; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13136; GFX10-WGP:       ; %bb.0: ; %entry
13137; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13138; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13139; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13140; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13141; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13142; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13143; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13144; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13145; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13146; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13147; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13148; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13149; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13150; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13151; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13152; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13153; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13154; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13155; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13156; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13157; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13158; GFX10-WGP-NEXT:    s_endpgm
13159;
13160; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13161; GFX10-CU:       ; %bb.0: ; %entry
13162; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13163; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13164; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13165; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13166; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13167; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13168; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13169; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13170; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13171; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13172; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13173; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13174; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13175; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13176; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13177; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13178; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13179; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13180; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13181; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13182; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13183; GFX10-CU-NEXT:    s_endpgm
13184;
13185; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13186; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13187; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13188; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13189; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13190; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13191; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13192; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13193; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13194; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13195; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13196; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13197; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13198; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13199; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13200; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13201; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13202; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13203; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13204; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13206; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13207; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13208; SKIP-CACHE-INV-NEXT:    s_endpgm
13209;
13210; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13211; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13212; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13213; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13214; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13215; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13216; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13217; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13218; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13219; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13220; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13221; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13222; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13223;
13224; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13225; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13226; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13227; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13228; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13229; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13230; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13231; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13232; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13233; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13234; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13235; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13236; GFX90A-TGSPLIT-NEXT:    s_endpgm
13237;
13238; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13239; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13240; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13241; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13242; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13243; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13244; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13245; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13246; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13247; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13248; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13249; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13250; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13251;
13252; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13253; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13254; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13255; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13256; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13257; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13258; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13259; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13260; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13261; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13262; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13263; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13264; GFX940-TGSPLIT-NEXT:    s_endpgm
13265;
13266; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13267; GFX11-WGP:       ; %bb.0: ; %entry
13268; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13269; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13270; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13271; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13272; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13273; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13274; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13275; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13276; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13277; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13278; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13279; GFX11-WGP-NEXT:    s_endpgm
13280;
13281; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13282; GFX11-CU:       ; %bb.0: ; %entry
13283; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13284; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13285; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13286; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13287; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13288; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13289; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13290; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13291; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13292; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13293; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13294; GFX11-CU-NEXT:    s_endpgm
13295;
13296; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13297; GFX12-WGP:       ; %bb.0: ; %entry
13298; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13299; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13300; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13301; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13302; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13303; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13304; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13305; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13306; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13307; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13308; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13309; GFX12-WGP-NEXT:    s_endpgm
13310;
13311; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13312; GFX12-CU:       ; %bb.0: ; %entry
13313; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13314; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13315; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13316; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13317; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13318; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13319; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13320; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13321; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13322; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13323; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13324; GFX12-CU-NEXT:    s_endpgm
13325    ptr %out, i32 %in, i32 %old) {
13326entry:
13327  %gep = getelementptr i32, ptr %out, i32 4
13328  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
13329  ret void
13330}
13331
13332define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
13333; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13334; GFX7:       ; %bb.0: ; %entry
13335; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13336; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13337; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13338; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13339; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13340; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13341; GFX7-NEXT:    s_mov_b32 s4, s8
13342; GFX7-NEXT:    s_mov_b32 s5, s9
13343; GFX7-NEXT:    s_mov_b32 s9, s10
13344; GFX7-NEXT:    s_mov_b32 s8, s11
13345; GFX7-NEXT:    s_add_u32 s4, s4, s9
13346; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13347; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13348; GFX7-NEXT:    s_mov_b32 s5, s8
13349; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13350; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13351; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13352; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13353; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13354; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13355; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13356; GFX7-NEXT:    s_endpgm
13357;
13358; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13359; GFX10-WGP:       ; %bb.0: ; %entry
13360; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13361; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13362; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13363; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13364; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13365; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13366; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13367; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13368; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13369; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13370; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13371; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13372; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13373; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13374; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13375; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13376; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13377; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13378; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13379; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13380; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13381; GFX10-WGP-NEXT:    s_endpgm
13382;
13383; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13384; GFX10-CU:       ; %bb.0: ; %entry
13385; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13386; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13387; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13388; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13389; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13390; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13391; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13392; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13393; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13394; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13395; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13396; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13397; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13398; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13399; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13400; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13401; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13402; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13403; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13404; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13405; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13406; GFX10-CU-NEXT:    s_endpgm
13407;
13408; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13409; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13410; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13411; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13412; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13413; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13414; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13415; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13416; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13417; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13418; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13419; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13420; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13421; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13422; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13423; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13424; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13425; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13426; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13427; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13428; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13429; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13430; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13431; SKIP-CACHE-INV-NEXT:    s_endpgm
13432;
13433; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13434; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13435; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13436; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13437; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13438; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13439; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13440; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13441; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13442; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13443; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13444; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13445; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13446;
13447; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13448; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13449; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13450; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13451; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13452; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13453; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13454; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13455; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13456; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13457; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13458; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13459; GFX90A-TGSPLIT-NEXT:    s_endpgm
13460;
13461; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13462; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13463; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13464; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13465; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13466; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13467; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13468; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13469; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13470; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13471; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13472; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13473; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13474;
13475; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13476; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13477; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13478; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13479; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13480; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13481; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13482; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13483; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13484; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13485; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13486; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13487; GFX940-TGSPLIT-NEXT:    s_endpgm
13488;
13489; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13490; GFX11-WGP:       ; %bb.0: ; %entry
13491; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13492; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13493; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13494; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13495; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13496; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13497; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13498; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13499; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13500; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13501; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13502; GFX11-WGP-NEXT:    s_endpgm
13503;
13504; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13505; GFX11-CU:       ; %bb.0: ; %entry
13506; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13507; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13508; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13509; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13510; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13511; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13512; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13513; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13514; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13515; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13516; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13517; GFX11-CU-NEXT:    s_endpgm
13518;
13519; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13520; GFX12-WGP:       ; %bb.0: ; %entry
13521; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13522; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13523; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13524; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13525; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13526; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13527; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13528; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13529; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13530; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13531; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13532; GFX12-WGP-NEXT:    s_endpgm
13533;
13534; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
13535; GFX12-CU:       ; %bb.0: ; %entry
13536; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13537; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13538; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13539; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13540; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13541; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13542; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13543; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13544; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13545; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13546; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13547; GFX12-CU-NEXT:    s_endpgm
13548    ptr %out, i32 %in, i32 %old) {
13549entry:
13550  %gep = getelementptr i32, ptr %out, i32 4
13551  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
13552  ret void
13553}
13554
; Checks the code generated for a volatile i32 cmpxchg through a flat pointer
; at syncscope("wavefront-one-as") with acquire ordering on success and acquire
; ordering on failure.
;
; In every checked configuration the expected output is a single
; flat_atomic_cmpswap (flat_atomic_cmpswap_b32 in the GFX11 and GFX12 runs)
; followed directly by s_endpgm. The only wait in the expected output is the
; one that follows the scalar loads of the kernel arguments.
;
; The 16 byte displacement is added with s_add_u32/s_addc_u32 in the GFX7,
; GFX10 and SKIP-CACHE-INV runs, and appears as an immediate offset of 16 on
; the atomic instruction in the remaining runs.
13555define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
13556; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13557; GFX7:       ; %bb.0: ; %entry
13558; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13559; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13560; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13561; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13562; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13563; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13564; GFX7-NEXT:    s_mov_b32 s4, s8
13565; GFX7-NEXT:    s_mov_b32 s5, s9
13566; GFX7-NEXT:    s_mov_b32 s9, s10
13567; GFX7-NEXT:    s_mov_b32 s8, s11
13568; GFX7-NEXT:    s_add_u32 s4, s4, s9
13569; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13570; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13571; GFX7-NEXT:    s_mov_b32 s5, s8
13572; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13573; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13574; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13575; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13576; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13577; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13578; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13579; GFX7-NEXT:    s_endpgm
13580;
13581; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13582; GFX10-WGP:       ; %bb.0: ; %entry
13583; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13584; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13585; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13586; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13587; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13588; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13589; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13590; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13591; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13592; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13593; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13594; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13595; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13596; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13597; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13598; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13599; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13600; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13601; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13602; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13603; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13604; GFX10-WGP-NEXT:    s_endpgm
13605;
13606; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13607; GFX10-CU:       ; %bb.0: ; %entry
13608; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13609; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13610; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13611; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13612; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13613; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13614; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13615; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13616; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13617; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13618; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13619; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13620; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13621; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13622; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13623; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13624; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13625; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13626; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13627; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13628; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13629; GFX10-CU-NEXT:    s_endpgm
13630;
13631; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13632; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13633; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13634; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13635; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13636; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13637; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13638; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13639; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13640; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13641; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13642; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13643; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13644; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13645; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13646; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13649; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13650; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13651; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13652; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13653; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13654; SKIP-CACHE-INV-NEXT:    s_endpgm
13655;
13656; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13657; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13658; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13659; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13660; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13661; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13662; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13663; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13664; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13665; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13666; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13667; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13668; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13669;
13670; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13671; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13672; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13673; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13674; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13675; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13676; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13677; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13678; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13679; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13680; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13681; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13682; GFX90A-TGSPLIT-NEXT:    s_endpgm
13683;
13684; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13685; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13686; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13687; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13688; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13689; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13690; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13691; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13692; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13693; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13694; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13695; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13696; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13697;
13698; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13699; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13700; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13701; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13702; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13703; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13704; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13705; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13706; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13707; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13708; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13709; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13710; GFX940-TGSPLIT-NEXT:    s_endpgm
13711;
13712; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13713; GFX11-WGP:       ; %bb.0: ; %entry
13714; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13715; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13716; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13717; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13718; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13719; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13720; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13721; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13722; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13723; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13724; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13725; GFX11-WGP-NEXT:    s_endpgm
13726;
13727; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13728; GFX11-CU:       ; %bb.0: ; %entry
13729; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13730; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13731; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13732; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13733; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13734; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13735; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13736; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13737; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13738; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13739; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13740; GFX11-CU-NEXT:    s_endpgm
13741;
13742; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13743; GFX12-WGP:       ; %bb.0: ; %entry
13744; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13745; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13746; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13747; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13748; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13749; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13750; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13751; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13752; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13753; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13754; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13755; GFX12-WGP-NEXT:    s_endpgm
13756;
13757; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
13758; GFX12-CU:       ; %bb.0: ; %entry
13759; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13760; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13761; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13762; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13763; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13764; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13765; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13766; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13767; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13768; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13769; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13770; GFX12-CU-NEXT:    s_endpgm
13771    ptr %out, i32 %in, i32 %old) {
13772entry:
  ; %gep addresses the fifth i32 of %out, which is 16 bytes past the base.
13773  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match. The result %val is unused,
  ; so only the emitted atomic instruction is observable in the checks.
13774  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
13775  ret void
13776}
13777
; Checks the code generated for a volatile i32 cmpxchg through a flat pointer
; at syncscope("wavefront-one-as") with release ordering on success and acquire
; ordering on failure.
;
; In every checked configuration the expected output is a single
; flat_atomic_cmpswap (flat_atomic_cmpswap_b32 in the GFX11 and GFX12 runs)
; followed directly by s_endpgm. The only wait in the expected output is the
; one that follows the scalar loads of the kernel arguments.
;
; The 16 byte displacement is added with s_add_u32/s_addc_u32 in the GFX7,
; GFX10 and SKIP-CACHE-INV runs, and appears as an immediate offset of 16 on
; the atomic instruction in the remaining runs.
13778define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
13779; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13780; GFX7:       ; %bb.0: ; %entry
13781; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13782; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13783; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13784; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13785; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13786; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13787; GFX7-NEXT:    s_mov_b32 s4, s8
13788; GFX7-NEXT:    s_mov_b32 s5, s9
13789; GFX7-NEXT:    s_mov_b32 s9, s10
13790; GFX7-NEXT:    s_mov_b32 s8, s11
13791; GFX7-NEXT:    s_add_u32 s4, s4, s9
13792; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13793; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13794; GFX7-NEXT:    s_mov_b32 s5, s8
13795; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13796; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13797; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13798; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13799; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13800; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13801; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13802; GFX7-NEXT:    s_endpgm
13803;
13804; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13805; GFX10-WGP:       ; %bb.0: ; %entry
13806; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
13807; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13808; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
13809; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
13810; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
13811; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13812; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
13813; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
13814; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
13815; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
13816; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
13817; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
13818; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13819; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
13820; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
13821; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
13822; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13823; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
13824; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
13825; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
13826; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13827; GFX10-WGP-NEXT:    s_endpgm
13828;
13829; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13830; GFX10-CU:       ; %bb.0: ; %entry
13831; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
13832; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13833; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
13834; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
13835; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
13836; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13837; GFX10-CU-NEXT:    s_mov_b32 s4, s8
13838; GFX10-CU-NEXT:    s_mov_b32 s5, s9
13839; GFX10-CU-NEXT:    s_mov_b32 s9, s10
13840; GFX10-CU-NEXT:    s_mov_b32 s8, s11
13841; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
13842; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
13843; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13844; GFX10-CU-NEXT:    s_mov_b32 s5, s8
13845; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
13846; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
13847; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13848; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
13849; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
13850; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
13851; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13852; GFX10-CU-NEXT:    s_endpgm
13853;
13854; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13855; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13856; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
13857; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
13858; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
13859; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
13860; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
13861; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13862; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
13863; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
13864; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
13865; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
13866; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
13867; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
13868; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13869; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
13870; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
13871; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
13872; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13873; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
13874; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
13875; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
13876; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13877; SKIP-CACHE-INV-NEXT:    s_endpgm
13878;
13879; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13880; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13881; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13882; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13883; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13884; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13885; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13886; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13887; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13888; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13889; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13890; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13891; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13892;
13893; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13894; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13895; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13896; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13897; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13898; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13899; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13900; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
13901; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13902; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13903; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13904; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13905; GFX90A-TGSPLIT-NEXT:    s_endpgm
13906;
13907; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13908; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13909; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13910; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13911; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13912; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13913; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13914; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13915; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13916; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13917; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13918; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13919; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13920;
13921; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13922; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13923; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13924; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13925; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13926; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13927; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13928; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
13929; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13930; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
13931; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
13932; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13933; GFX940-TGSPLIT-NEXT:    s_endpgm
13934;
13935; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13936; GFX11-WGP:       ; %bb.0: ; %entry
13937; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13938; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13939; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13940; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13941; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
13942; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
13943; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13944; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
13945; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
13946; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
13947; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13948; GFX11-WGP-NEXT:    s_endpgm
13949;
13950; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13951; GFX11-CU:       ; %bb.0: ; %entry
13952; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13953; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13954; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13955; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13956; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
13957; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
13958; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13959; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
13960; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
13961; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
13962; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13963; GFX11-CU-NEXT:    s_endpgm
13964;
13965; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13966; GFX12-WGP:       ; %bb.0: ; %entry
13967; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13968; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13969; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13970; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13971; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
13972; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
13973; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13974; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
13975; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
13976; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
13977; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13978; GFX12-WGP-NEXT:    s_endpgm
13979;
13980; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
13981; GFX12-CU:       ; %bb.0: ; %entry
13982; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13983; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13984; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13985; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13986; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
13987; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
13988; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13989; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
13990; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
13991; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
13992; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13993; GFX12-CU-NEXT:    s_endpgm
13994    ptr %out, i32 %in, i32 %old) {
13995entry:
  ; %gep addresses the fifth i32 of %out, which is 16 bytes past the base.
13996  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and store %in on a match. The result %val is unused,
  ; so only the emitted atomic instruction is observable in the checks.
13997  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
13998  ret void
13999}
14000
14001define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
14002; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14003; GFX7:       ; %bb.0: ; %entry
14004; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14005; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14006; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14007; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14008; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14009; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14010; GFX7-NEXT:    s_mov_b32 s4, s8
14011; GFX7-NEXT:    s_mov_b32 s5, s9
14012; GFX7-NEXT:    s_mov_b32 s9, s10
14013; GFX7-NEXT:    s_mov_b32 s8, s11
14014; GFX7-NEXT:    s_add_u32 s4, s4, s9
14015; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14016; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14017; GFX7-NEXT:    s_mov_b32 s5, s8
14018; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14019; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14020; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14021; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14022; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14023; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14024; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14025; GFX7-NEXT:    s_endpgm
14026;
14027; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14028; GFX10-WGP:       ; %bb.0: ; %entry
14029; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14030; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14031; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14032; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14033; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14034; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14035; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14036; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14037; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14038; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14039; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14040; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14041; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14042; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14043; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14044; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14045; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14046; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14047; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14048; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14049; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14050; GFX10-WGP-NEXT:    s_endpgm
14051;
14052; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14053; GFX10-CU:       ; %bb.0: ; %entry
14054; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14055; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14056; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14057; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14058; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14059; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14060; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14061; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14062; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14063; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14064; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14065; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14066; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14067; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14068; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14069; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14070; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14071; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14072; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14073; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14074; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14075; GFX10-CU-NEXT:    s_endpgm
14076;
14077; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14078; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14079; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14080; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14081; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14082; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14083; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14084; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14085; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14086; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14087; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14088; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14089; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14090; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14091; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14092; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14093; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14095; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14096; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14097; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14098; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14099; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14100; SKIP-CACHE-INV-NEXT:    s_endpgm
14101;
14102; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14103; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14104; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14105; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14106; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14107; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14108; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14109; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14110; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14111; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14112; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14113; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14114; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14115;
14116; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14117; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14118; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14119; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14120; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14121; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14122; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14123; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14124; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14125; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14126; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14127; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14128; GFX90A-TGSPLIT-NEXT:    s_endpgm
14129;
14130; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14131; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14132; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14133; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14134; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14135; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14136; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14137; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14138; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14139; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14140; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14141; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14142; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14143;
14144; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14145; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14146; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14147; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14148; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14149; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14150; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14151; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14152; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14153; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14154; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14155; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14156; GFX940-TGSPLIT-NEXT:    s_endpgm
14157;
14158; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14159; GFX11-WGP:       ; %bb.0: ; %entry
14160; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14161; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14162; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14163; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14164; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14165; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14166; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14167; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14168; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14169; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14170; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14171; GFX11-WGP-NEXT:    s_endpgm
14172;
14173; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14174; GFX11-CU:       ; %bb.0: ; %entry
14175; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14176; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14177; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14178; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14179; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14180; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14181; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14182; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14183; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14184; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14185; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14186; GFX11-CU-NEXT:    s_endpgm
14187;
14188; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14189; GFX12-WGP:       ; %bb.0: ; %entry
14190; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14191; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14192; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14193; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14194; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14195; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14196; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14197; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14198; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14199; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14200; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14201; GFX12-WGP-NEXT:    s_endpgm
14202;
14203; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
14204; GFX12-CU:       ; %bb.0: ; %entry
14205; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14206; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14207; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14208; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14209; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14210; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14211; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14212; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14213; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14214; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14215; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14216; GFX12-CU-NEXT:    s_endpgm
14217    ptr %out, i32 %in, i32 %old) {
14218entry:
14219  %gep = getelementptr i32, ptr %out, i32 4
14220  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
14221  ret void
14222}
14223
; Kernel under test: a volatile cmpxchg through a flat pointer with
; syncscope("wavefront-one-as"), success ordering seq_cst and failure ordering
; acquire. In every expected output below the atomic compare-swap instruction
; is immediately followed by s_endpgm.
14224define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
14225; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14226; GFX7:       ; %bb.0: ; %entry
14227; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14228; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14229; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14230; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14231; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14232; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14233; GFX7-NEXT:    s_mov_b32 s4, s8
14234; GFX7-NEXT:    s_mov_b32 s5, s9
14235; GFX7-NEXT:    s_mov_b32 s9, s10
14236; GFX7-NEXT:    s_mov_b32 s8, s11
14237; GFX7-NEXT:    s_add_u32 s4, s4, s9
14238; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14239; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14240; GFX7-NEXT:    s_mov_b32 s5, s8
14241; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14242; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14243; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14244; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14245; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14246; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14247; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14248; GFX7-NEXT:    s_endpgm
14249;
14250; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14251; GFX10-WGP:       ; %bb.0: ; %entry
14252; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14253; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14254; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14255; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14256; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14257; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14258; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14259; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14260; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14261; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14262; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14263; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14264; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14265; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14266; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14267; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14268; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14269; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14270; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14271; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14272; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14273; GFX10-WGP-NEXT:    s_endpgm
14274;
14275; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14276; GFX10-CU:       ; %bb.0: ; %entry
14277; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14278; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14279; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14280; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14281; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14282; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14283; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14284; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14285; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14286; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14287; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14288; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14289; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14290; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14291; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14292; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14293; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14294; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14296; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14297; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14298; GFX10-CU-NEXT:    s_endpgm
14299;
14300; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14301; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14302; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14303; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14304; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14305; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14306; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14307; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14308; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14309; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14310; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14311; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14312; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14313; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14314; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14315; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14316; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14317; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14318; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14319; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14320; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14321; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14322; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14323; SKIP-CACHE-INV-NEXT:    s_endpgm
14324;
14325; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14326; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14327; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14328; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14329; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14330; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14331; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14332; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14333; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14334; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14335; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14336; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14337; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14338;
14339; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14340; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14341; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14342; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14343; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14344; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14345; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14346; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14347; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14348; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14349; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14350; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14351; GFX90A-TGSPLIT-NEXT:    s_endpgm
14352;
14353; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14354; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14355; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14356; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14357; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14358; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14359; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14360; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14361; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14362; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14363; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14364; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14365; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14366;
14367; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14368; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14369; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14370; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14371; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14372; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14373; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14374; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14375; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14376; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14377; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14378; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14379; GFX940-TGSPLIT-NEXT:    s_endpgm
14380;
14381; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14382; GFX11-WGP:       ; %bb.0: ; %entry
14383; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14384; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14385; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14386; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14387; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14388; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14389; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14390; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14391; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14392; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14393; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14394; GFX11-WGP-NEXT:    s_endpgm
14395;
14396; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14397; GFX11-CU:       ; %bb.0: ; %entry
14398; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14399; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14400; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14401; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14402; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14403; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14404; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14405; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14406; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14407; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14408; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14409; GFX11-CU-NEXT:    s_endpgm
14410;
14411; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14412; GFX12-WGP:       ; %bb.0: ; %entry
14413; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14414; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14415; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14416; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14417; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14418; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14419; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14420; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14421; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14422; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14423; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14424; GFX12-WGP-NEXT:    s_endpgm
14425;
14426; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
14427; GFX12-CU:       ; %bb.0: ; %entry
14428; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14429; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14430; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14431; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14432; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14433; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14434; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14435; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14436; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14437; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14438; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14439; GFX12-CU-NEXT:    s_endpgm
14440    ptr %out, i32 %in, i32 %old) {
14441entry:
; Address the i32 at index 4 from %out, i.e. byte offset 16; the expected
; outputs above show this either as a 64-bit scalar add of 16 or as an
; offset of 16 on the atomic instruction.
14442  %gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in is the replacement value. The result
; %val is not used anywhere in this function.
14443  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
14444  ret void
14445}
14446
; Kernel under test: a volatile cmpxchg through a flat pointer with
; syncscope("wavefront-one-as"), success ordering monotonic and failure
; ordering seq_cst. In every expected output below the atomic compare-swap
; instruction is immediately followed by s_endpgm.
14447define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
14448; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14449; GFX7:       ; %bb.0: ; %entry
14450; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14451; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14452; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14453; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14454; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14455; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14456; GFX7-NEXT:    s_mov_b32 s4, s8
14457; GFX7-NEXT:    s_mov_b32 s5, s9
14458; GFX7-NEXT:    s_mov_b32 s9, s10
14459; GFX7-NEXT:    s_mov_b32 s8, s11
14460; GFX7-NEXT:    s_add_u32 s4, s4, s9
14461; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14462; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14463; GFX7-NEXT:    s_mov_b32 s5, s8
14464; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14465; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14466; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14467; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14468; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14469; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14470; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14471; GFX7-NEXT:    s_endpgm
14472;
14473; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14474; GFX10-WGP:       ; %bb.0: ; %entry
14475; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14476; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14477; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14478; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14479; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14480; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14481; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14482; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14483; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14484; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14485; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14486; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14487; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14488; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14489; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14490; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14491; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14492; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14493; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14494; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14495; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14496; GFX10-WGP-NEXT:    s_endpgm
14497;
14498; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14499; GFX10-CU:       ; %bb.0: ; %entry
14500; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14501; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14502; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14503; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14504; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14505; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14506; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14507; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14508; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14509; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14510; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14511; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14512; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14513; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14514; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14515; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14516; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14517; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14518; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14519; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14520; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14521; GFX10-CU-NEXT:    s_endpgm
14522;
14523; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14524; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14525; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14526; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14527; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14528; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14529; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14530; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14531; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14532; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14533; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14534; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14535; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14536; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14537; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14541; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14542; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14543; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14544; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14545; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14546; SKIP-CACHE-INV-NEXT:    s_endpgm
14547;
14548; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14549; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14550; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14551; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14552; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14553; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14554; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14555; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14556; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14557; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14558; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14559; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14560; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14561;
14562; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14563; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14564; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14565; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14566; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14567; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14568; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14569; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14570; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14571; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14572; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14573; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14574; GFX90A-TGSPLIT-NEXT:    s_endpgm
14575;
14576; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14577; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14578; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14579; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14580; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14581; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14582; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14583; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14584; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14585; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14586; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14587; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14588; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14589;
14590; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14591; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14592; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14593; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14594; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14595; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14596; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14597; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14598; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14599; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14600; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14601; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14602; GFX940-TGSPLIT-NEXT:    s_endpgm
14603;
14604; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14605; GFX11-WGP:       ; %bb.0: ; %entry
14606; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14607; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14608; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14609; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14610; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14611; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14612; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14613; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14614; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14615; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14616; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14617; GFX11-WGP-NEXT:    s_endpgm
14618;
14619; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14620; GFX11-CU:       ; %bb.0: ; %entry
14621; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14622; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14623; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14624; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14625; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14626; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14627; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14628; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14629; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14630; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14631; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14632; GFX11-CU-NEXT:    s_endpgm
14633;
14634; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14635; GFX12-WGP:       ; %bb.0: ; %entry
14636; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14637; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14638; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14639; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14640; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14641; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14642; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14643; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14644; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14645; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14646; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14647; GFX12-WGP-NEXT:    s_endpgm
14648;
14649; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14650; GFX12-CU:       ; %bb.0: ; %entry
14651; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14652; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14653; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14654; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14655; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14656; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14657; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14658; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14659; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14660; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14661; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14662; GFX12-CU-NEXT:    s_endpgm
14663    ptr %out, i32 %in, i32 %old) {
14664entry:
; Address the i32 at index 4 from %out, i.e. byte offset 16; the expected
; outputs above show this either as a 64-bit scalar add of 16 or as an
; offset of 16 on the atomic instruction.
14665  %gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in is the replacement value. The result
; %val is not used anywhere in this function.
14666  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
14667  ret void
14668}
14669
; cmpxchg with syncscope("wavefront-one-as"), success ordering acquire and
; failure ordering seq_cst, on a flat pointer 16 bytes (4 x i32) past %out.
; The result %val is unused. For every run line the expected code is the
; kernel-argument loads and their wait, register copies, and one flat
; compare-swap followed directly by s_endpgm; no other wait and no cache
; maintenance instruction appears in the checks. The gfx700 and gfx1010 runs
; add the 16-byte displacement with scalar adds, while the gfx90a, gfx940,
; gfx1100 and gfx1200 runs put it in the "offset" field of the atomic.
14670define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
14671; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14672; GFX7:       ; %bb.0: ; %entry
14673; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14674; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14675; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14676; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14677; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14678; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14679; GFX7-NEXT:    s_mov_b32 s4, s8
14680; GFX7-NEXT:    s_mov_b32 s5, s9
14681; GFX7-NEXT:    s_mov_b32 s9, s10
14682; GFX7-NEXT:    s_mov_b32 s8, s11
14683; GFX7-NEXT:    s_add_u32 s4, s4, s9
14684; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14685; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14686; GFX7-NEXT:    s_mov_b32 s5, s8
14687; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14688; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14689; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14690; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14691; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14692; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14693; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14694; GFX7-NEXT:    s_endpgm
14695;
14696; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14697; GFX10-WGP:       ; %bb.0: ; %entry
14698; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14699; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14700; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14701; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14702; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14703; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14704; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14705; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14706; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14707; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14708; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14709; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14710; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14711; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14712; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14713; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14714; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14715; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14716; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14717; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14718; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14719; GFX10-WGP-NEXT:    s_endpgm
14720;
14721; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14722; GFX10-CU:       ; %bb.0: ; %entry
14723; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14724; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14725; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14726; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14727; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14728; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14729; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14730; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14731; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14732; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14733; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14734; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14735; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14736; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14737; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14738; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14739; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14740; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14741; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14742; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14743; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14744; GFX10-CU-NEXT:    s_endpgm
14745;
14746; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14747; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14748; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14749; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14750; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14751; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14752; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14753; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14754; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14755; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14756; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14757; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14758; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14759; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14760; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14761; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14763; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14764; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14765; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14766; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14767; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14768; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14769; SKIP-CACHE-INV-NEXT:    s_endpgm
14770;
14771; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14772; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14773; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14774; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14775; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14776; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14777; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14778; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14779; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14780; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14781; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14782; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14783; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14784;
14785; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14786; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14787; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14788; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14789; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14790; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14791; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14792; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
14793; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14794; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14795; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14796; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14797; GFX90A-TGSPLIT-NEXT:    s_endpgm
14798;
14799; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14800; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14801; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14802; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14803; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14804; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14805; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14806; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14807; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14808; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14809; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14810; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14811; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14812;
14813; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14814; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14815; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14816; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14817; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14818; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14819; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14820; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
14821; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14822; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
14823; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
14824; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14825; GFX940-TGSPLIT-NEXT:    s_endpgm
14826;
14827; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14828; GFX11-WGP:       ; %bb.0: ; %entry
14829; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14830; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14831; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14832; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14833; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
14834; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
14835; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14836; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
14837; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
14838; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
14839; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14840; GFX11-WGP-NEXT:    s_endpgm
14841;
14842; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14843; GFX11-CU:       ; %bb.0: ; %entry
14844; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14845; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14846; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14847; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14848; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
14849; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
14850; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14851; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
14852; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
14853; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
14854; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14855; GFX11-CU-NEXT:    s_endpgm
14856;
14857; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14858; GFX12-WGP:       ; %bb.0: ; %entry
14859; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14860; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14861; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14862; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14863; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
14864; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
14865; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14866; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
14867; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
14868; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
14869; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14870; GFX12-WGP-NEXT:    s_endpgm
14871;
14872; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
14873; GFX12-CU:       ; %bb.0: ; %entry
14874; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14875; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14876; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14877; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14878; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
14879; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
14880; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14881; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
14882; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
14883; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
14884; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14885; GFX12-CU-NEXT:    s_endpgm
14886    ptr %out, i32 %in, i32 %old) {
14887entry:
  ; Element index 4 of i32 is a 16-byte displacement from %out.
14888  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and write %in on a match; the result is not used.
14889  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
14890  ret void
14891}
14892
; cmpxchg with syncscope("wavefront-one-as"), success ordering release and
; failure ordering seq_cst, on a flat pointer 16 bytes (4 x i32) past %out.
; The result %val is unused. For every run line the expected code is the
; kernel-argument loads and their wait, register copies, and one flat
; compare-swap followed directly by s_endpgm; no other wait and no cache
; maintenance instruction appears in the checks. The gfx700 and gfx1010 runs
; add the 16-byte displacement with scalar adds, while the gfx90a, gfx940,
; gfx1100 and gfx1200 runs put it in the "offset" field of the atomic.
14893define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
14894; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
14895; GFX7:       ; %bb.0: ; %entry
14896; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14897; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14898; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14899; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14900; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14901; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14902; GFX7-NEXT:    s_mov_b32 s4, s8
14903; GFX7-NEXT:    s_mov_b32 s5, s9
14904; GFX7-NEXT:    s_mov_b32 s9, s10
14905; GFX7-NEXT:    s_mov_b32 s8, s11
14906; GFX7-NEXT:    s_add_u32 s4, s4, s9
14907; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14908; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14909; GFX7-NEXT:    s_mov_b32 s5, s8
14910; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14911; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14912; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14913; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14914; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14915; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14916; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14917; GFX7-NEXT:    s_endpgm
14918;
14919; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
14920; GFX10-WGP:       ; %bb.0: ; %entry
14921; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
14922; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14923; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
14924; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
14925; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
14926; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14927; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
14928; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
14929; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
14930; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
14931; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
14932; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
14933; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14934; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
14935; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
14936; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
14937; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14938; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
14939; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
14940; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
14941; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14942; GFX10-WGP-NEXT:    s_endpgm
14943;
14944; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
14945; GFX10-CU:       ; %bb.0: ; %entry
14946; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
14947; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14948; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
14949; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
14950; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
14951; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14952; GFX10-CU-NEXT:    s_mov_b32 s4, s8
14953; GFX10-CU-NEXT:    s_mov_b32 s5, s9
14954; GFX10-CU-NEXT:    s_mov_b32 s9, s10
14955; GFX10-CU-NEXT:    s_mov_b32 s8, s11
14956; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
14957; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
14958; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14959; GFX10-CU-NEXT:    s_mov_b32 s5, s8
14960; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
14961; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
14962; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14963; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
14964; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
14965; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
14966; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14967; GFX10-CU-NEXT:    s_endpgm
14968;
14969; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
14970; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14971; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
14972; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
14973; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
14974; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
14975; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
14976; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14977; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
14978; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
14979; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
14980; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
14981; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
14982; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
14983; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14984; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
14985; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
14986; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
14987; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14988; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
14989; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
14990; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
14991; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14992; SKIP-CACHE-INV-NEXT:    s_endpgm
14993;
14994; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
14995; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14996; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14997; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14998; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14999; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15000; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15001; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15002; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15003; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15004; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15005; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15006; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15007;
15008; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15009; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15010; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15011; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15012; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15013; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15014; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15015; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15016; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15017; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15018; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15019; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15020; GFX90A-TGSPLIT-NEXT:    s_endpgm
15021;
15022; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15023; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15024; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15025; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15026; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15027; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15028; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15029; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15030; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15031; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15032; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15033; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15034; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15035;
15036; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15037; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15038; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15039; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15040; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15041; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15042; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15043; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15044; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15045; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15046; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15047; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15048; GFX940-TGSPLIT-NEXT:    s_endpgm
15049;
15050; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15051; GFX11-WGP:       ; %bb.0: ; %entry
15052; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15053; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15054; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15055; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15056; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15057; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15058; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15059; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15060; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15061; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15062; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15063; GFX11-WGP-NEXT:    s_endpgm
15064;
15065; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15066; GFX11-CU:       ; %bb.0: ; %entry
15067; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15068; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15069; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15070; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15071; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15072; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15073; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15074; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15075; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15076; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15077; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15078; GFX11-CU-NEXT:    s_endpgm
15079;
15080; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15081; GFX12-WGP:       ; %bb.0: ; %entry
15082; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15083; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15084; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15085; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15086; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15087; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15088; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15089; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15090; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15091; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15092; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15093; GFX12-WGP-NEXT:    s_endpgm
15094;
15095; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
15096; GFX12-CU:       ; %bb.0: ; %entry
15097; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15098; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15099; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15100; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15101; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15102; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15103; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15104; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15105; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15106; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15107; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15108; GFX12-CU-NEXT:    s_endpgm
15109    ptr %out, i32 %in, i32 %old) {
15110entry:
  ; Element index 4 of i32 is a 16-byte displacement from %out.
15111  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and write %in on a match; the result is not used.
15112  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
15113  ret void
15114}
15115
; cmpxchg with syncscope("wavefront-one-as"), success ordering acq_rel and
; failure ordering seq_cst, on a flat pointer 16 bytes (4 x i32) past %out.
; The result %val is unused. For every run line the expected code is the
; kernel-argument loads and their wait, register copies, and one flat
; compare-swap followed directly by s_endpgm; no other wait and no cache
; maintenance instruction appears in the checks. The gfx700 and gfx1010 runs
; add the 16-byte displacement with scalar adds, while the gfx90a, gfx940,
; gfx1100 and gfx1200 runs put it in the "offset" field of the atomic.
15116define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
15117; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15118; GFX7:       ; %bb.0: ; %entry
15119; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15120; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15121; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15122; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15123; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15124; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15125; GFX7-NEXT:    s_mov_b32 s4, s8
15126; GFX7-NEXT:    s_mov_b32 s5, s9
15127; GFX7-NEXT:    s_mov_b32 s9, s10
15128; GFX7-NEXT:    s_mov_b32 s8, s11
15129; GFX7-NEXT:    s_add_u32 s4, s4, s9
15130; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15131; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15132; GFX7-NEXT:    s_mov_b32 s5, s8
15133; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15134; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15135; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15136; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15137; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15138; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15139; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15140; GFX7-NEXT:    s_endpgm
15141;
15142; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15143; GFX10-WGP:       ; %bb.0: ; %entry
15144; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15145; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15146; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15147; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15148; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15149; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15150; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15151; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15152; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15153; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15154; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15155; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15156; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15157; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15158; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15159; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15160; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15161; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15162; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15163; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15164; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15165; GFX10-WGP-NEXT:    s_endpgm
15166;
15167; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15168; GFX10-CU:       ; %bb.0: ; %entry
15169; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15170; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15171; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15172; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15173; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15174; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15175; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15176; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15177; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15178; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15179; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15180; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15181; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15182; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15183; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15184; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15185; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15186; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15187; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15188; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15189; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15190; GFX10-CU-NEXT:    s_endpgm
15191;
15192; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15193; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15194; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15195; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15196; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15197; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15198; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15199; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15200; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15201; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15202; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15203; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15204; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15205; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15206; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15207; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15209; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15210; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15211; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15212; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15213; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15214; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15215; SKIP-CACHE-INV-NEXT:    s_endpgm
15216;
15217; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15218; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15219; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15220; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15221; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15222; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15223; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15224; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15225; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15226; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15227; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15228; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15229; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15230;
15231; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15232; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15233; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15234; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15235; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15236; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15237; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15238; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15239; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15240; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15241; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15242; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15243; GFX90A-TGSPLIT-NEXT:    s_endpgm
15244;
15245; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15246; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15247; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15248; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15249; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15250; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15251; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15252; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15253; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15254; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15255; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15256; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15257; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15258;
15259; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15260; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15261; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15262; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15263; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15264; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15265; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15266; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15267; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15268; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15269; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15270; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15271; GFX940-TGSPLIT-NEXT:    s_endpgm
15272;
15273; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15274; GFX11-WGP:       ; %bb.0: ; %entry
15275; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15276; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15277; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15278; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15279; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15280; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15281; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15282; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15283; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15284; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15285; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15286; GFX11-WGP-NEXT:    s_endpgm
15287;
15288; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15289; GFX11-CU:       ; %bb.0: ; %entry
15290; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15291; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15292; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15293; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15294; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15295; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15296; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15297; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15298; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15299; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15300; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15301; GFX11-CU-NEXT:    s_endpgm
15302;
15303; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15304; GFX12-WGP:       ; %bb.0: ; %entry
15305; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15306; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15307; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15308; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15309; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15310; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15311; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15312; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15313; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15314; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15315; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15316; GFX12-WGP-NEXT:    s_endpgm
15317;
15318; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15319; GFX12-CU:       ; %bb.0: ; %entry
15320; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15321; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15322; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15323; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15324; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15325; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15326; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15327; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15328; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15329; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15330; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15331; GFX12-CU-NEXT:    s_endpgm
15332    ptr %out, i32 %in, i32 %old) {
15333entry:
  ; Element index 4 of i32 is a 16-byte displacement from %out.
15334  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare against %old and write %in on a match; the result is not used.
15335  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
15336  ret void
15337}
15338
; Codegen test: 32-bit flat cmpxchg with syncscope "wavefront-one-as" and
; seq_cst ordering for both the success and the failure case. The result of
; the cmpxchg is not used, and the expected atomic has no destination register.
; In every expected output below the flat_atomic_cmpswap is immediately
; followed by s_endpgm, and the only wait before it is the one that follows
; the scalar loads at the top of the kernel.
; The gfx700, gfx1010 and skip-cache-invalidation runs add the constant 16 to
; the pointer with s_add_u32/s_addc_u32; the remaining runs encode it as
; offset:16 on the atomic itself.
15339define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
15340; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15341; GFX7:       ; %bb.0: ; %entry
15342; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15343; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15344; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15345; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15346; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15347; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15348; GFX7-NEXT:    s_mov_b32 s4, s8
15349; GFX7-NEXT:    s_mov_b32 s5, s9
15350; GFX7-NEXT:    s_mov_b32 s9, s10
15351; GFX7-NEXT:    s_mov_b32 s8, s11
15352; GFX7-NEXT:    s_add_u32 s4, s4, s9
15353; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15354; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15355; GFX7-NEXT:    s_mov_b32 s5, s8
15356; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15357; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15358; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15359; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15360; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15361; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15362; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15363; GFX7-NEXT:    s_endpgm
15364;
15365; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15366; GFX10-WGP:       ; %bb.0: ; %entry
15367; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
15368; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15369; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
15370; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
15371; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
15372; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15373; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
15374; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
15375; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
15376; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
15377; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
15378; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
15379; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15380; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
15381; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
15382; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15383; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15384; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15385; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15386; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15387; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15388; GFX10-WGP-NEXT:    s_endpgm
15389;
15390; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15391; GFX10-CU:       ; %bb.0: ; %entry
15392; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
15393; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15394; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
15395; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
15396; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
15397; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15398; GFX10-CU-NEXT:    s_mov_b32 s4, s8
15399; GFX10-CU-NEXT:    s_mov_b32 s5, s9
15400; GFX10-CU-NEXT:    s_mov_b32 s9, s10
15401; GFX10-CU-NEXT:    s_mov_b32 s8, s11
15402; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
15403; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
15404; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15405; GFX10-CU-NEXT:    s_mov_b32 s5, s8
15406; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
15407; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15408; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15409; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15410; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15411; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15412; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15413; GFX10-CU-NEXT:    s_endpgm
15414;
15415; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15416; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15417; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
15418; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
15419; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
15420; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
15421; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
15422; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15423; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
15424; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
15425; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
15426; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
15427; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
15428; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
15429; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15430; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
15431; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
15432; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15433; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15434; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15437; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15438; SKIP-CACHE-INV-NEXT:    s_endpgm
15439;
15440; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15441; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15442; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15443; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15444; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15445; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15446; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15447; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15448; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15449; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15450; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15451; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15452; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15453;
15454; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15455; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15456; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15457; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15458; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15459; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15460; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15461; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15462; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15463; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15464; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15465; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15466; GFX90A-TGSPLIT-NEXT:    s_endpgm
15467;
15468; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15469; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15470; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15471; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15472; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15473; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15474; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15475; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15476; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15477; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15478; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15479; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15480; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15481;
15482; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15483; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15484; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15485; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15486; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15487; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15488; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15489; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15490; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15491; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15492; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15493; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15494; GFX940-TGSPLIT-NEXT:    s_endpgm
15495;
15496; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15497; GFX11-WGP:       ; %bb.0: ; %entry
15498; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15499; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15500; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15501; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15502; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15503; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15504; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15505; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15506; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15507; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15508; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15509; GFX11-WGP-NEXT:    s_endpgm
15510;
15511; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15512; GFX11-CU:       ; %bb.0: ; %entry
15513; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15514; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15515; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15516; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15517; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15518; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15519; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15520; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15521; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15522; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15523; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15524; GFX11-CU-NEXT:    s_endpgm
15525;
15526; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15527; GFX12-WGP:       ; %bb.0: ; %entry
15528; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15529; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15530; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15531; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15532; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15533; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15534; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15535; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15536; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15537; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15538; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15539; GFX12-WGP-NEXT:    s_endpgm
15540;
15541; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15542; GFX12-CU:       ; %bb.0: ; %entry
15543; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15544; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15545; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15546; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15547; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15548; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15549; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15550; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15551; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15552; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15553; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15554; GFX12-CU-NEXT:    s_endpgm
15555    ptr %out, i32 %in, i32 %old) {
15556entry:
  ; Element 4 of an i32 array, i.e. byte offset 16 from %out; this is the 16
  ; that appears in the expected output above.
15557  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match. %val is
  ; intentionally left unused.
15558  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
15559  ret void
15560}
15561
; Codegen test: 32-bit flat cmpxchg with syncscope "wavefront-one-as" and
; monotonic ordering for both the success and the failure case, where the
; value read from memory is used (it is stored to %out).
; In the expected output below the atomic therefore has a destination register
; (v2) and carries glc, sc0 or th:TH_ATOMIC_RETURN depending on the target, and
; a wait is placed between the atomic and the flat store of v2. That wait is
; "vmcnt(0) lgkmcnt(0)" for most runs, only "vmcnt(0)" for the two +tgsplit
; runs, and s_wait_loadcnt_dscnt for the gfx1200 runs.
; The gfx700, gfx1010 and skip-cache-invalidation runs add the constant 16 to
; the pointer with s_add_u32/s_addc_u32; the remaining runs encode it as
; offset:16 on the atomic itself.
15562define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
15563; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15564; GFX7:       ; %bb.0: ; %entry
15565; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
15566; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15567; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
15568; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
15569; GFX7-NEXT:    s_mov_b64 s[12:13], 16
15570; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15571; GFX7-NEXT:    s_mov_b32 s6, s4
15572; GFX7-NEXT:    s_mov_b32 s7, s5
15573; GFX7-NEXT:    s_mov_b32 s11, s12
15574; GFX7-NEXT:    s_mov_b32 s10, s13
15575; GFX7-NEXT:    s_add_u32 s6, s6, s11
15576; GFX7-NEXT:    s_addc_u32 s10, s7, s10
15577; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15578; GFX7-NEXT:    s_mov_b32 s7, s10
15579; GFX7-NEXT:    v_mov_b32_e32 v2, s9
15580; GFX7-NEXT:    v_mov_b32_e32 v0, s8
15581; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15582; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15583; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15584; GFX7-NEXT:    v_mov_b32_e32 v1, s7
15585; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15586; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15587; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15588; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15589; GFX7-NEXT:    flat_store_dword v[0:1], v2
15590; GFX7-NEXT:    s_endpgm
15591;
15592; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15593; GFX10-WGP:       ; %bb.0: ; %entry
15594; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
15595; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15596; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
15597; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
15598; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
15599; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15600; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
15601; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
15602; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
15603; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
15604; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
15605; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
15606; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15607; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
15608; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
15609; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
15610; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15611; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15612; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15613; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
15614; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15615; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15616; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15617; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15618; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
15619; GFX10-WGP-NEXT:    s_endpgm
15620;
15621; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15622; GFX10-CU:       ; %bb.0: ; %entry
15623; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
15624; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15625; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
15626; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
15627; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
15628; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15629; GFX10-CU-NEXT:    s_mov_b32 s6, s4
15630; GFX10-CU-NEXT:    s_mov_b32 s7, s5
15631; GFX10-CU-NEXT:    s_mov_b32 s11, s12
15632; GFX10-CU-NEXT:    s_mov_b32 s10, s13
15633; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
15634; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
15635; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15636; GFX10-CU-NEXT:    s_mov_b32 s7, s10
15637; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
15638; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
15639; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15640; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15641; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15642; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
15643; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15644; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15645; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15646; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15647; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
15648; GFX10-CU-NEXT:    s_endpgm
15649;
15650; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15651; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15652; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
15653; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
15654; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
15655; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
15656; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
15657; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15658; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
15659; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
15660; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
15661; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
15662; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
15663; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
15664; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
15665; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
15666; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
15667; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
15668; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15669; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15670; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15671; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
15672; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15673; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15674; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15675; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15676; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
15677; SKIP-CACHE-INV-NEXT:    s_endpgm
15678;
15679; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15680; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15681; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15682; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15683; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15684; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15685; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15686; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15687; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15688; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15689; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15690; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
15691; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15692; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15693; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
15694; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15695;
15696; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15697; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15698; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15699; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15700; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15701; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15702; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15703; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15704; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15705; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15706; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15707; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
15708; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15709; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15710; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
15711; GFX90A-TGSPLIT-NEXT:    s_endpgm
15712;
15713; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15714; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15715; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15716; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15717; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15718; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15719; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15720; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15721; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15722; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15723; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15724; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
15725; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15726; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15727; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
15728; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15729;
15730; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15731; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15732; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15733; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15734; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15735; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15736; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15737; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15738; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15739; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15740; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15741; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
15742; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15743; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15744; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
15745; GFX940-TGSPLIT-NEXT:    s_endpgm
15746;
15747; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15748; GFX11-WGP:       ; %bb.0: ; %entry
15749; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15750; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15751; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15752; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15753; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
15754; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
15755; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15756; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
15757; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15758; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15759; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
15760; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
15761; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
15762; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15763; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
15764; GFX11-WGP-NEXT:    s_endpgm
15765;
15766; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15767; GFX11-CU:       ; %bb.0: ; %entry
15768; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15769; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15770; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15771; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15772; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
15773; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
15774; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15775; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
15776; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15777; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15778; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
15779; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
15780; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
15781; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15782; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
15783; GFX11-CU-NEXT:    s_endpgm
15784;
15785; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15786; GFX12-WGP:       ; %bb.0: ; %entry
15787; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15788; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15789; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15790; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15791; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
15792; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
15793; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15794; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
15795; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15796; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15797; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
15798; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
15799; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
15800; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
15801; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
15802; GFX12-WGP-NEXT:    s_endpgm
15803;
15804; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15805; GFX12-CU:       ; %bb.0: ; %entry
15806; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15807; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15808; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15809; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15810; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
15811; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
15812; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15813; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
15814; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15815; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15816; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
15817; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
15818; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
15819; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
15820; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
15821; GFX12-CU-NEXT:    s_endpgm
15822    ptr %out, i32 %in, i32 %old) {
15823entry:
  ; Element 4 of an i32 array, i.e. byte offset 16 from %out; this is the 16
  ; that appears in the expected output above.
15824  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the word at %gep with %old and write %in on a match.
15825  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
  ; Field 0 of the cmpxchg result is the i32 read from memory; storing it to
  ; the base pointer %out (not %gep) keeps the atomic's result live.
15826  %val0 = extractvalue { i32, i1 } %val, 0
15827  store i32 %val0, ptr %out, align 4
15828  ret void
15829}
15830
15831define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
15832; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15833; GFX7:       ; %bb.0: ; %entry
15834; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
15835; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15836; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
15837; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
15838; GFX7-NEXT:    s_mov_b64 s[12:13], 16
15839; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15840; GFX7-NEXT:    s_mov_b32 s6, s4
15841; GFX7-NEXT:    s_mov_b32 s7, s5
15842; GFX7-NEXT:    s_mov_b32 s11, s12
15843; GFX7-NEXT:    s_mov_b32 s10, s13
15844; GFX7-NEXT:    s_add_u32 s6, s6, s11
15845; GFX7-NEXT:    s_addc_u32 s10, s7, s10
15846; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15847; GFX7-NEXT:    s_mov_b32 s7, s10
15848; GFX7-NEXT:    v_mov_b32_e32 v2, s9
15849; GFX7-NEXT:    v_mov_b32_e32 v0, s8
15850; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15851; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15852; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15853; GFX7-NEXT:    v_mov_b32_e32 v1, s7
15854; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15855; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15856; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15857; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15858; GFX7-NEXT:    flat_store_dword v[0:1], v2
15859; GFX7-NEXT:    s_endpgm
15860;
15861; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15862; GFX10-WGP:       ; %bb.0: ; %entry
15863; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
15864; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15865; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
15866; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
15867; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
15868; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15869; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
15870; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
15871; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
15872; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
15873; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
15874; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
15875; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15876; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
15877; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
15878; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
15879; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15880; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
15881; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
15882; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
15883; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15884; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
15885; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
15886; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15887; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
15888; GFX10-WGP-NEXT:    s_endpgm
15889;
15890; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15891; GFX10-CU:       ; %bb.0: ; %entry
15892; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
15893; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15894; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
15895; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
15896; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
15897; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15898; GFX10-CU-NEXT:    s_mov_b32 s6, s4
15899; GFX10-CU-NEXT:    s_mov_b32 s7, s5
15900; GFX10-CU-NEXT:    s_mov_b32 s11, s12
15901; GFX10-CU-NEXT:    s_mov_b32 s10, s13
15902; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
15903; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
15904; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15905; GFX10-CU-NEXT:    s_mov_b32 s7, s10
15906; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
15907; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
15908; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15909; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
15910; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
15911; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
15912; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15913; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
15914; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
15915; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15916; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
15917; GFX10-CU-NEXT:    s_endpgm
15918;
15919; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15920; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15921; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
15922; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
15923; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
15924; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
15925; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
15926; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15927; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
15928; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
15929; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
15930; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
15931; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
15932; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
15933; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
15934; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
15935; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
15936; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
15937; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15938; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
15939; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
15940; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
15941; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15942; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
15943; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
15944; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15945; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
15946; SKIP-CACHE-INV-NEXT:    s_endpgm
15947;
15948; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15949; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15950; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15951; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15952; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15953; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15954; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15955; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15956; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15957; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15958; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15959; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
15960; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15961; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15962; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
15963; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15964;
15965; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15966; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15967; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15968; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15969; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15970; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15971; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15972; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
15973; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15974; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15975; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15976; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
15977; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15978; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15979; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
15980; GFX90A-TGSPLIT-NEXT:    s_endpgm
15981;
15982; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15983; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15984; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15985; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15986; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15987; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15988; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15989; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
15990; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15991; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
15992; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15993; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
15994; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
15995; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15996; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
15997; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15998;
15999; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
16000; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16001; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16002; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16003; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16004; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16005; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16006; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16007; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16008; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16009; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16010; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16011; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16012; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16013; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16014; GFX940-TGSPLIT-NEXT:    s_endpgm
16015;
16016; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
16017; GFX11-WGP:       ; %bb.0: ; %entry
16018; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16019; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16020; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16021; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16022; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16023; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16024; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16025; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16026; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16027; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16028; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16029; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16030; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16031; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16032; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16033; GFX11-WGP-NEXT:    s_endpgm
16034;
16035; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
16036; GFX11-CU:       ; %bb.0: ; %entry
16037; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16038; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16039; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16040; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16041; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16042; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16043; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16044; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16045; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16046; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16047; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16048; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16049; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16050; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16051; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16052; GFX11-CU-NEXT:    s_endpgm
16053;
16054; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
16055; GFX12-WGP:       ; %bb.0: ; %entry
16056; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16057; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16058; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16059; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16060; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16061; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16062; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16063; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16064; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16065; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16066; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16067; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16068; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16069; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
16070; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16071; GFX12-WGP-NEXT:    s_endpgm
16072;
16073; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
16074; GFX12-CU:       ; %bb.0: ; %entry
16075; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16076; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16077; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16078; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16079; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16080; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16081; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16082; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16083; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16084; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16085; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16086; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16087; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16088; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16089; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16090; GFX12-CU-NEXT:    s_endpgm
16091    ptr %out, i32 %in, i32 %old) {
16092entry:
16093  %gep = getelementptr i32, ptr %out, i32 4
16094  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
16095  %val0 = extractvalue { i32, i1 } %val, 0
16096  store i32 %val0, ptr %out, align 4
16097  ret void
16098}
16099
; Test kernel for a "wavefront-one-as" scoped compare-and-swap through a flat
; pointer whose returned value is used afterwards.
;
;   %out - base pointer. The cmpxchg targets the i32 element at index 4 past
;          it (the expected assembly shows this as a 16-byte offset), and the
;          value returned by the cmpxchg is stored to %out itself.
;   %in  - replacement value written when the comparison succeeds.
;   %old - value the memory location is compared against.
;
; The cmpxchg is volatile, with acq_rel ordering on success and monotonic
; ordering on failure.
;
; The check lines between the "define" line and the parameter list are the
; autogenerated assertions mentioned in the note at the top of the file
; (utils/update_llc_test_checks.py); regenerate them with that script rather
; than editing them by hand.
16100define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
16101; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16102; GFX7:       ; %bb.0: ; %entry
16103; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16104; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16105; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16106; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16107; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16108; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16109; GFX7-NEXT:    s_mov_b32 s6, s4
16110; GFX7-NEXT:    s_mov_b32 s7, s5
16111; GFX7-NEXT:    s_mov_b32 s11, s12
16112; GFX7-NEXT:    s_mov_b32 s10, s13
16113; GFX7-NEXT:    s_add_u32 s6, s6, s11
16114; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16115; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16116; GFX7-NEXT:    s_mov_b32 s7, s10
16117; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16118; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16119; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16120; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16121; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16122; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16123; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16124; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16125; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16126; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16127; GFX7-NEXT:    flat_store_dword v[0:1], v2
16128; GFX7-NEXT:    s_endpgm
16129;
16130; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16131; GFX10-WGP:       ; %bb.0: ; %entry
16132; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16133; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16134; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16135; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16136; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16137; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16138; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16139; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16140; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16141; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16142; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16143; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16144; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16145; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16146; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16147; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16148; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16149; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16150; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16151; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16152; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16153; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16154; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16155; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16156; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16157; GFX10-WGP-NEXT:    s_endpgm
16158;
16159; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16160; GFX10-CU:       ; %bb.0: ; %entry
16161; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16162; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16163; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16164; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16165; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16166; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16167; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16168; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16169; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16170; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16171; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16172; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16173; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16174; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16175; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16176; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16177; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16178; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16179; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16180; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16181; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16182; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16183; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16184; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16185; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16186; GFX10-CU-NEXT:    s_endpgm
16187;
16188; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16189; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16190; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16191; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16192; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16193; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16194; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16195; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16196; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16197; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16198; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16199; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16200; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16201; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16202; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16203; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16204; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16206; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16207; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16208; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16209; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16210; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16211; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16212; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16213; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16214; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16215; SKIP-CACHE-INV-NEXT:    s_endpgm
16216;
16217; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16218; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16219; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16220; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16221; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16222; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16223; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16224; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16225; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16226; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16227; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16228; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16229; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16230; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16231; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16232; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16233;
16234; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16235; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16236; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16237; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16238; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16239; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16240; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16241; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16242; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16243; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16244; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16245; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16246; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16247; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16248; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16249; GFX90A-TGSPLIT-NEXT:    s_endpgm
16250;
16251; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16252; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16253; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16254; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16255; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16256; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16257; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16258; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16259; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16260; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16261; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16262; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16263; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16264; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16265; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16266; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16267;
16268; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16269; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16270; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16271; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16272; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16273; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16274; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16275; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16276; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16277; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16278; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16279; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16280; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16281; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16282; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16283; GFX940-TGSPLIT-NEXT:    s_endpgm
16284;
16285; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16286; GFX11-WGP:       ; %bb.0: ; %entry
16287; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16288; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16289; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16290; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16291; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16292; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16293; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16294; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16295; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16296; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16297; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16298; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16299; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16300; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16301; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16302; GFX11-WGP-NEXT:    s_endpgm
16303;
16304; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16305; GFX11-CU:       ; %bb.0: ; %entry
16306; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16307; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16308; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16309; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16310; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16311; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16312; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16313; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16314; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16315; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16316; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16317; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16318; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16319; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16320; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16321; GFX11-CU-NEXT:    s_endpgm
16322;
16323; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16324; GFX12-WGP:       ; %bb.0: ; %entry
16325; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16326; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16327; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16328; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16329; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16330; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16331; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16332; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16333; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16334; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16335; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16336; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16337; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16338; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
16339; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16340; GFX12-WGP-NEXT:    s_endpgm
16341;
16342; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16343; GFX12-CU:       ; %bb.0: ; %entry
16344; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16345; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16346; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16347; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16348; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16349; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16350; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16351; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16352; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16353; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16354; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16355; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16356; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16357; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16358; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16359; GFX12-CU-NEXT:    s_endpgm
16360    ptr %out, i32 %in, i32 %old) {
16361entry:
  ; Address of the i32 element at index 4 relative to %out.
16362  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the value at %gep with %old and, if equal, replace it with %in.
16363  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
  ; Element 0 of the { i32, i1 } result is the i32 value; element 1 (the i1) is unused here.
16364  %val0 = extractvalue { i32, i1 } %val, 0
  ; Use the returned value by storing it to %out.
16365  store i32 %val0, ptr %out, align 4
16366  ret void
16367}
16368
16369define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
16370; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16371; GFX7:       ; %bb.0: ; %entry
16372; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16373; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16374; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16375; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16376; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16377; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16378; GFX7-NEXT:    s_mov_b32 s6, s4
16379; GFX7-NEXT:    s_mov_b32 s7, s5
16380; GFX7-NEXT:    s_mov_b32 s11, s12
16381; GFX7-NEXT:    s_mov_b32 s10, s13
16382; GFX7-NEXT:    s_add_u32 s6, s6, s11
16383; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16384; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16385; GFX7-NEXT:    s_mov_b32 s7, s10
16386; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16387; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16388; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16389; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16390; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16391; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16392; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16393; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16394; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16395; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16396; GFX7-NEXT:    flat_store_dword v[0:1], v2
16397; GFX7-NEXT:    s_endpgm
16398;
16399; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16400; GFX10-WGP:       ; %bb.0: ; %entry
16401; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16402; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16403; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16404; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16405; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16406; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16407; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16408; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16409; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16410; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16411; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16412; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16413; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16414; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16415; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16416; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16417; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16418; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16419; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16420; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16421; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16422; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16423; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16424; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16425; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16426; GFX10-WGP-NEXT:    s_endpgm
16427;
16428; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16429; GFX10-CU:       ; %bb.0: ; %entry
16430; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16431; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16432; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16433; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16434; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16435; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16436; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16437; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16438; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16439; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16440; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16441; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16442; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16443; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16444; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16445; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16446; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16447; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16448; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16449; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16450; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16451; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16452; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16453; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16454; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16455; GFX10-CU-NEXT:    s_endpgm
16456;
16457; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16458; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16459; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16460; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16461; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16462; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16463; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16464; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16465; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16466; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16467; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16468; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16469; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16470; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16471; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16472; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16473; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16474; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16475; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16476; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16477; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16478; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16479; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16480; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16481; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16482; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16483; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16484; SKIP-CACHE-INV-NEXT:    s_endpgm
16485;
16486; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16487; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16488; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16489; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16490; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16491; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16492; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16493; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16494; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16495; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16496; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16497; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16498; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16500; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16501; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16502;
16503; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16504; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16505; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16506; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16507; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16508; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16509; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16510; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16511; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16512; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16513; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16514; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16515; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16516; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16517; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16518; GFX90A-TGSPLIT-NEXT:    s_endpgm
16519;
16520; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16521; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16522; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16523; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16524; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16525; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16526; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16527; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16528; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16529; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16530; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16531; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16532; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16533; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16534; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16535; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16536;
16537; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16538; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16539; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16540; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16541; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16542; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16543; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16544; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16545; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16546; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16547; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16548; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16549; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16550; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16551; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16552; GFX940-TGSPLIT-NEXT:    s_endpgm
16553;
16554; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16555; GFX11-WGP:       ; %bb.0: ; %entry
16556; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16557; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16558; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16559; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16560; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16561; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16562; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16563; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16564; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16565; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16566; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16567; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16568; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16569; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16570; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16571; GFX11-WGP-NEXT:    s_endpgm
16572;
16573; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16574; GFX11-CU:       ; %bb.0: ; %entry
16575; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16576; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16577; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16578; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16579; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16580; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16581; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16582; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16583; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16584; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16585; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16586; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16587; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16588; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16589; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16590; GFX11-CU-NEXT:    s_endpgm
16591;
16592; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16593; GFX12-WGP:       ; %bb.0: ; %entry
16594; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16595; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16596; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16597; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16598; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16599; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16600; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16601; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16602; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16603; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16604; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16605; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16606; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16607; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
16608; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16609; GFX12-WGP-NEXT:    s_endpgm
16610;
16611; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16612; GFX12-CU:       ; %bb.0: ; %entry
16613; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16614; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16615; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16616; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16617; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16618; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16619; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16620; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16621; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16622; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16623; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16624; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16625; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16626; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16627; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16628; GFX12-CU-NEXT:    s_endpgm
16629    ptr %out, i32 %in, i32 %old) {
16630entry:
16631  %gep = getelementptr i32, ptr %out, i32 4
16632  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
16633  %val0 = extractvalue { i32, i1 } %val, 0
16634  store i32 %val0, ptr %out, align 4
16635  ret void
16636}
16637
; Returning cmpxchg on a flat (generic `ptr`) address at
; syncscope("wavefront-one-as"), success ordering monotonic, failure ordering
; acquire. The kernel compare-and-swaps the i32 located 4 elements past %out
; (comparing against %old, writing %in) and stores the value it loaded to %out.
;
; In the expected sequences below, every run configuration emits exactly one
; flat_atomic_cmpswap (carrying glc, sc0 or th:TH_ATOMIC_RETURN depending on the
; target), then a wait, then the store of the result. None of the expected
; sequences contains a cache-invalidate instruction. The gfx700, gfx1010 and
; amdpal gfx700 configurations add the constant 16 to the pointer with scalar
; instructions; the remaining configurations fold it into "offset:16".
16638define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
16639; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16640; GFX7:       ; %bb.0: ; %entry
16641; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16642; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16643; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16644; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16645; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16646; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16647; GFX7-NEXT:    s_mov_b32 s6, s4
16648; GFX7-NEXT:    s_mov_b32 s7, s5
16649; GFX7-NEXT:    s_mov_b32 s11, s12
16650; GFX7-NEXT:    s_mov_b32 s10, s13
16651; GFX7-NEXT:    s_add_u32 s6, s6, s11
16652; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16653; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16654; GFX7-NEXT:    s_mov_b32 s7, s10
16655; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16656; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16657; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16658; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16659; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16660; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16661; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16662; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16663; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16664; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16665; GFX7-NEXT:    flat_store_dword v[0:1], v2
16666; GFX7-NEXT:    s_endpgm
16667;
16668; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16669; GFX10-WGP:       ; %bb.0: ; %entry
16670; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16671; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16672; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16673; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16674; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16675; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16676; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16677; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16678; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16679; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16680; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16681; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16682; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16683; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16684; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16685; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16686; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16687; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16688; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16689; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16690; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16691; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16692; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16693; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16694; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16695; GFX10-WGP-NEXT:    s_endpgm
16696;
16697; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16698; GFX10-CU:       ; %bb.0: ; %entry
16699; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16700; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16701; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16702; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16703; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16704; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16705; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16706; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16707; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16708; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16709; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16710; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16711; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16712; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16713; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16714; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16715; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16716; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16717; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16718; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16719; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16720; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16721; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16722; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16723; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16724; GFX10-CU-NEXT:    s_endpgm
16725;
16726; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16727; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16728; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16729; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16730; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16731; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16732; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
16733; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16734; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
16735; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
16736; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
16737; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
16738; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
16739; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
16740; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16741; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16742; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
16743; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
16744; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16745; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
16746; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
16747; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
16748; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16749; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
16750; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
16751; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16752; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
16753; SKIP-CACHE-INV-NEXT:    s_endpgm
16754;
16755; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16756; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16757; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16758; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16759; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16760; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16761; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16762; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16763; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16764; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16765; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16766; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16767; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16768; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16769; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16770; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16771;
16772; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16773; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16774; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16775; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16776; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16777; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16778; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16779; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
16780; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16781; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16782; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16783; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16784; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16785; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16786; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
16787; GFX90A-TGSPLIT-NEXT:    s_endpgm
16788;
16789; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16790; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16791; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16792; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16793; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16794; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16795; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16796; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16797; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16798; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16799; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16800; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16801; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16802; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16803; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16804; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16805;
16806; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16807; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16808; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16809; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16810; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16811; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16812; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16813; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
16814; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16815; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
16816; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16817; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16818; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
16819; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16820; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
16821; GFX940-TGSPLIT-NEXT:    s_endpgm
16822;
16823; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16824; GFX11-WGP:       ; %bb.0: ; %entry
16825; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16826; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16827; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16828; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16829; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
16830; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
16831; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16832; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
16833; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16834; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16835; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16836; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
16837; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
16838; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16839; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
16840; GFX11-WGP-NEXT:    s_endpgm
16841;
16842; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16843; GFX11-CU:       ; %bb.0: ; %entry
16844; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16845; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16846; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16847; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16848; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
16849; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
16850; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16851; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
16852; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16853; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16854; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16855; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
16856; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
16857; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16858; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
16859; GFX11-CU-NEXT:    s_endpgm
16860;
16861; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16862; GFX12-WGP:       ; %bb.0: ; %entry
16863; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16864; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16865; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16866; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16867; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
16868; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
16869; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16870; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
16871; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16872; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16873; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16874; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
16875; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
16876; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
16877; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
16878; GFX12-WGP-NEXT:    s_endpgm
16879;
16880; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16881; GFX12-CU:       ; %bb.0: ; %entry
16882; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16883; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16884; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16885; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16886; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
16887; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
16888; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16889; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
16890; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16891; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16892; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16893; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
16894; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
16895; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
16896; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
16897; GFX12-CU-NEXT:    s_endpgm
16898    ptr %out, i32 %in, i32 %old) {
16899entry:
  ; Address of the i32 at index 4 from %out, i.e. 16 bytes past %out; this is
  ; the 16 that shows up as "offset:16" or as an added constant above.
16900  %gep = getelementptr i32, ptr %out, i32 4
  ; Volatile compare-and-swap: expected value %old, replacement %in; success
  ; ordering monotonic, failure ordering acquire, wavefront-one-as scope.
16901  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
  ; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
16902  %val0 = extractvalue { i32, i1 } %val, 0
  ; The loaded value is written through the original, un-offset pointer.
16903  store i32 %val0, ptr %out, align 4
16904  ret void
16905}
16906
16907define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
16908; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
16909; GFX7:       ; %bb.0: ; %entry
16910; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16911; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16912; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16913; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16914; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16915; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16916; GFX7-NEXT:    s_mov_b32 s6, s4
16917; GFX7-NEXT:    s_mov_b32 s7, s5
16918; GFX7-NEXT:    s_mov_b32 s11, s12
16919; GFX7-NEXT:    s_mov_b32 s10, s13
16920; GFX7-NEXT:    s_add_u32 s6, s6, s11
16921; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16922; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16923; GFX7-NEXT:    s_mov_b32 s7, s10
16924; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16925; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16926; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16927; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16928; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16929; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16930; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16931; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16932; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16933; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16934; GFX7-NEXT:    flat_store_dword v[0:1], v2
16935; GFX7-NEXT:    s_endpgm
16936;
16937; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
16938; GFX10-WGP:       ; %bb.0: ; %entry
16939; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
16940; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16941; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
16942; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
16943; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
16944; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16945; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
16946; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
16947; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
16948; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
16949; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
16950; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
16951; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16952; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
16953; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
16954; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
16955; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16956; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
16957; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
16958; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16959; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16960; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
16961; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
16962; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16963; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
16964; GFX10-WGP-NEXT:    s_endpgm
16965;
16966; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
16967; GFX10-CU:       ; %bb.0: ; %entry
16968; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
16969; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16970; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
16971; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
16972; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
16973; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16974; GFX10-CU-NEXT:    s_mov_b32 s6, s4
16975; GFX10-CU-NEXT:    s_mov_b32 s7, s5
16976; GFX10-CU-NEXT:    s_mov_b32 s11, s12
16977; GFX10-CU-NEXT:    s_mov_b32 s10, s13
16978; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
16979; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
16980; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16981; GFX10-CU-NEXT:    s_mov_b32 s7, s10
16982; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
16983; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
16984; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16985; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
16986; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
16987; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16988; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16989; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
16990; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
16991; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16992; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
16993; GFX10-CU-NEXT:    s_endpgm
16994;
16995; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
16996; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16997; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16998; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16999; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17000; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17001; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17002; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17003; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17004; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17005; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17006; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17007; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17008; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17009; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17010; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17011; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17012; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17013; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17014; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17015; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17016; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17017; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17018; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17019; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17020; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17021; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17022; SKIP-CACHE-INV-NEXT:    s_endpgm
17023;
17024; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17025; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17026; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17027; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17028; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17029; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17030; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17031; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17032; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17033; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17034; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17035; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17036; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17037; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17038; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17039; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17040;
17041; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17042; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17043; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17044; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17045; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17046; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17047; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17048; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17049; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17050; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17051; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17052; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17053; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17054; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17055; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17056; GFX90A-TGSPLIT-NEXT:    s_endpgm
17057;
17058; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17059; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17060; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17061; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17062; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17063; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17064; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17065; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17066; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17067; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17068; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17069; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17070; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17071; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17072; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17073; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17074;
17075; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17076; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17077; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17078; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17079; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17080; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17081; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17082; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17083; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17084; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17085; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17086; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17087; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17088; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17089; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17090; GFX940-TGSPLIT-NEXT:    s_endpgm
17091;
17092; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17093; GFX11-WGP:       ; %bb.0: ; %entry
17094; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17095; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17096; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17097; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17098; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17099; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17100; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17101; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17102; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17103; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17104; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17105; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17106; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17107; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17108; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17109; GFX11-WGP-NEXT:    s_endpgm
17110;
17111; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17112; GFX11-CU:       ; %bb.0: ; %entry
17113; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17114; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17115; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17116; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17117; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17118; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17119; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17120; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17121; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17122; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17123; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17124; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17125; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17126; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17127; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17128; GFX11-CU-NEXT:    s_endpgm
17129;
17130; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17131; GFX12-WGP:       ; %bb.0: ; %entry
17132; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17133; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17134; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17135; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17136; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17137; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17138; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17139; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17140; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17141; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17142; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17143; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17144; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17145; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
17146; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17147; GFX12-WGP-NEXT:    s_endpgm
17148;
17149; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17150; GFX12-CU:       ; %bb.0: ; %entry
17151; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17152; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17153; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17154; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17155; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17156; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17157; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17158; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17159; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17160; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17161; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17162; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17163; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17164; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17165; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17166; GFX12-CU-NEXT:    s_endpgm
17167    ptr %out, i32 %in, i32 %old) {
17168entry:
17169  %gep = getelementptr i32, ptr %out, i32 4
17170  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
17171  %val0 = extractvalue { i32, i1 } %val, 0
17172  store i32 %val0, ptr %out, align 4
17173  ret void
17174}
17175
; Returning flat cmpxchg at syncscope "wavefront-one-as" with release success
; ordering and acquire failure ordering.
; The IR compares the i32 at %out + 4 elements (byte offset 16) against %old,
; swaps in %in, and stores the previously loaded value (field 0 of the
; cmpxchg result) back to %out.
; What the expected sequences below show:
;  - the gfx700 and gfx1010 runs form the +16 address with
;    s_add_u32/s_addc_u32; the gfx90a, gfx940, gfx1100 and gfx1200 runs fold
;    it into the atomic as "offset:16".
;  - the returning atomic is marked glc, sc0 (gfx940) or
;    th:TH_ATOMIC_RETURN (gfx1200).
;  - the wait before the final store is vmcnt(0) alone in the tgsplit runs,
;    vmcnt(0) lgkmcnt(0) in the other pre-gfx1200 runs, and
;    s_wait_loadcnt_dscnt 0x0 on gfx1200.
;  - no cache invalidate or write-back instruction appears for any run.
17176define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
17177; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17178; GFX7:       ; %bb.0: ; %entry
17179; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17180; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17181; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17182; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17183; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17184; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17185; GFX7-NEXT:    s_mov_b32 s6, s4
17186; GFX7-NEXT:    s_mov_b32 s7, s5
17187; GFX7-NEXT:    s_mov_b32 s11, s12
17188; GFX7-NEXT:    s_mov_b32 s10, s13
17189; GFX7-NEXT:    s_add_u32 s6, s6, s11
17190; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17191; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17192; GFX7-NEXT:    s_mov_b32 s7, s10
17193; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17194; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17195; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17196; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17197; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17198; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17199; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17200; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17201; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17202; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17203; GFX7-NEXT:    flat_store_dword v[0:1], v2
17204; GFX7-NEXT:    s_endpgm
17205;
17206; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17207; GFX10-WGP:       ; %bb.0: ; %entry
17208; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17209; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17210; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17211; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17212; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17213; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17214; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17215; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17216; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17217; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17218; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17219; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17220; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17221; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17222; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17223; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17224; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17225; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17226; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17227; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17228; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17229; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17230; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17231; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17232; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17233; GFX10-WGP-NEXT:    s_endpgm
17234;
17235; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17236; GFX10-CU:       ; %bb.0: ; %entry
17237; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
17238; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17239; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
17240; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
17241; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
17242; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17243; GFX10-CU-NEXT:    s_mov_b32 s6, s4
17244; GFX10-CU-NEXT:    s_mov_b32 s7, s5
17245; GFX10-CU-NEXT:    s_mov_b32 s11, s12
17246; GFX10-CU-NEXT:    s_mov_b32 s10, s13
17247; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
17248; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
17249; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17250; GFX10-CU-NEXT:    s_mov_b32 s7, s10
17251; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
17252; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
17253; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17254; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17255; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17256; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17257; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17258; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17259; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17260; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17261; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
17262; GFX10-CU-NEXT:    s_endpgm
17263;
17264; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17265; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17266; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17267; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17268; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17269; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17270; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17271; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17272; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17273; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17274; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17275; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17276; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17277; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17278; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17279; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17280; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17281; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17282; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17283; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17284; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17285; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17286; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17287; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17288; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17289; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17290; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17291; SKIP-CACHE-INV-NEXT:    s_endpgm
17292;
17293; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17294; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17295; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17296; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17297; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17298; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17299; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17300; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17301; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17302; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17303; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17304; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17305; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17306; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17307; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17308; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17309;
17310; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17311; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17312; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17313; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17314; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17315; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17316; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17317; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17318; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17319; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17320; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17321; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17322; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17323; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17324; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17325; GFX90A-TGSPLIT-NEXT:    s_endpgm
17326;
17327; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17328; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17329; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17330; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17331; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17332; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17333; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17334; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17335; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17336; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17337; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17338; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17339; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17340; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17341; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17342; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17343;
17344; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17345; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17346; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17347; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17348; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17349; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17350; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17351; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17352; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17353; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17354; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17355; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17356; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17357; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17358; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17359; GFX940-TGSPLIT-NEXT:    s_endpgm
17360;
17361; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17362; GFX11-WGP:       ; %bb.0: ; %entry
17363; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17364; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17365; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17366; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17367; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17368; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17369; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17370; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17371; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17372; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17373; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17374; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17375; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17376; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17377; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17378; GFX11-WGP-NEXT:    s_endpgm
17379;
17380; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17381; GFX11-CU:       ; %bb.0: ; %entry
17382; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17383; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17384; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17385; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17386; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17387; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17388; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17389; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17390; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17391; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17392; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17393; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17394; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17395; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17396; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17397; GFX11-CU-NEXT:    s_endpgm
17398;
17399; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17400; GFX12-WGP:       ; %bb.0: ; %entry
17401; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17402; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17403; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17404; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17405; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17406; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17407; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17408; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17409; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17410; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17411; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17412; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17413; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17414; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
17415; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17416; GFX12-WGP-NEXT:    s_endpgm
17417;
17418; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
17419; GFX12-CU:       ; %bb.0: ; %entry
17420; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17421; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17422; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17423; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17424; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17425; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17426; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17427; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17428; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17429; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17430; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17431; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17432; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17433; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17434; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17435; GFX12-CU-NEXT:    s_endpgm
17436    ptr %out, i32 %in, i32 %old) {
17437entry:
17438  %gep = getelementptr i32, ptr %out, i32 4
17439  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
17440  %val0 = extractvalue { i32, i1 } %val, 0
17441  store i32 %val0, ptr %out, align 4
17442  ret void
17443}
17444
; Returning flat cmpxchg at syncscope "wavefront-one-as" with acq_rel success
; ordering and acquire failure ordering.
; The IR compares the i32 at %out + 4 elements (byte offset 16) against %old,
; swaps in %in, and stores the previously loaded value (field 0 of the
; cmpxchg result) back to %out.
; What the expected sequences below show:
;  - the gfx700 and gfx1010 runs form the +16 address with
;    s_add_u32/s_addc_u32; the gfx90a, gfx940, gfx1100 and gfx1200 runs fold
;    it into the atomic as "offset:16".
;  - the returning atomic is marked glc, sc0 (gfx940) or
;    th:TH_ATOMIC_RETURN (gfx1200).
;  - the wait before the final store is vmcnt(0) alone in the tgsplit runs,
;    vmcnt(0) lgkmcnt(0) in the other pre-gfx1200 runs, and
;    s_wait_loadcnt_dscnt 0x0 on gfx1200.
;  - no cache invalidate or write-back instruction appears for any run.
17445define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
17446; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17447; GFX7:       ; %bb.0: ; %entry
17448; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17449; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17450; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17451; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17452; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17453; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17454; GFX7-NEXT:    s_mov_b32 s6, s4
17455; GFX7-NEXT:    s_mov_b32 s7, s5
17456; GFX7-NEXT:    s_mov_b32 s11, s12
17457; GFX7-NEXT:    s_mov_b32 s10, s13
17458; GFX7-NEXT:    s_add_u32 s6, s6, s11
17459; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17460; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17461; GFX7-NEXT:    s_mov_b32 s7, s10
17462; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17463; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17464; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17465; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17466; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17467; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17468; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17469; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17470; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17471; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17472; GFX7-NEXT:    flat_store_dword v[0:1], v2
17473; GFX7-NEXT:    s_endpgm
17474;
17475; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17476; GFX10-WGP:       ; %bb.0: ; %entry
17477; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17478; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17479; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17480; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17481; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17482; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17483; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17484; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17485; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17486; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17487; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17488; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17489; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17490; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17491; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17492; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17493; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17494; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17495; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17496; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17497; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17498; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17499; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17500; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17501; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17502; GFX10-WGP-NEXT:    s_endpgm
17503;
17504; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17505; GFX10-CU:       ; %bb.0: ; %entry
17506; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
17507; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17508; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
17509; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
17510; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
17511; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17512; GFX10-CU-NEXT:    s_mov_b32 s6, s4
17513; GFX10-CU-NEXT:    s_mov_b32 s7, s5
17514; GFX10-CU-NEXT:    s_mov_b32 s11, s12
17515; GFX10-CU-NEXT:    s_mov_b32 s10, s13
17516; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
17517; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
17518; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17519; GFX10-CU-NEXT:    s_mov_b32 s7, s10
17520; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
17521; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
17522; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17523; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17524; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17525; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17526; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17527; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17528; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17529; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17530; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
17531; GFX10-CU-NEXT:    s_endpgm
17532;
17533; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17534; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17535; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17536; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17537; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17538; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17539; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17540; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17541; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17542; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17543; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17544; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17545; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17546; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17547; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17548; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17549; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17550; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17551; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17552; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17553; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17554; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17555; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17556; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17557; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17558; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17559; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17560; SKIP-CACHE-INV-NEXT:    s_endpgm
17561;
17562; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17563; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17564; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17565; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17566; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17567; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17568; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17569; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17570; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17571; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17572; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17573; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17574; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17575; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17576; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17577; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17578;
17579; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17580; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17581; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17582; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17583; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17584; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17585; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17586; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17587; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17588; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17589; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17590; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17591; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17592; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17593; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17594; GFX90A-TGSPLIT-NEXT:    s_endpgm
17595;
17596; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17597; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17598; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17599; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17600; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17601; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17602; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17603; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17604; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17605; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17606; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17607; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17608; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17609; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17610; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17611; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17612;
17613; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17614; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17615; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17616; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17617; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17618; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17619; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17620; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17621; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17622; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17623; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17624; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17625; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17626; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17627; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17628; GFX940-TGSPLIT-NEXT:    s_endpgm
17629;
17630; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17631; GFX11-WGP:       ; %bb.0: ; %entry
17632; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17633; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17634; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17635; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17636; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17637; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17638; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17639; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17640; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17641; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17642; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17643; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17644; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17645; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17646; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17647; GFX11-WGP-NEXT:    s_endpgm
17648;
17649; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17650; GFX11-CU:       ; %bb.0: ; %entry
17651; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17652; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17653; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17654; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17655; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17656; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17657; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17658; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17659; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17660; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17661; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17662; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17663; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17664; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17665; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17666; GFX11-CU-NEXT:    s_endpgm
17667;
17668; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17669; GFX12-WGP:       ; %bb.0: ; %entry
17670; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17671; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17672; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17673; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17674; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17675; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17676; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17677; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17678; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17679; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17680; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17681; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17682; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17683; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
17684; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17685; GFX12-WGP-NEXT:    s_endpgm
17686;
17687; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17688; GFX12-CU:       ; %bb.0: ; %entry
17689; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17690; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17691; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17692; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17693; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17694; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17695; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17696; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17697; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17698; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17699; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17700; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17701; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17702; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17703; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17704; GFX12-CU-NEXT:    s_endpgm
17705    ptr %out, i32 %in, i32 %old) {
17706entry:
17707  %gep = getelementptr i32, ptr %out, i32 4
17708  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
17709  %val0 = extractvalue { i32, i1 } %val, 0
17710  store i32 %val0, ptr %out, align 4
17711  ret void
17712}
17713
; Kernel under test: a volatile cmpxchg through a generic (flat) pointer to
; %out[4] (i32 elements, i.e. byte offset 16) at syncscope("wavefront-one-as")
; with success ordering seq_cst and failure ordering acquire. The value loaded
; by the cmpxchg (field 0 of its result pair) is then stored to %out[0].
;
; The prefixed check blocks that sit between the two halves of the signature
; are autogenerated (see the NOTE on the first line of the file), one block per
; RUN-line prefix. Regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand. In every expected sequence below, the returning
; cmpswap is followed only by address moves, a wait, and the store of v2.
17714define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
17715; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17716; GFX7:       ; %bb.0: ; %entry
17717; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17718; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17719; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17720; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17721; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17722; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17723; GFX7-NEXT:    s_mov_b32 s6, s4
17724; GFX7-NEXT:    s_mov_b32 s7, s5
17725; GFX7-NEXT:    s_mov_b32 s11, s12
17726; GFX7-NEXT:    s_mov_b32 s10, s13
17727; GFX7-NEXT:    s_add_u32 s6, s6, s11
17728; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17729; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17730; GFX7-NEXT:    s_mov_b32 s7, s10
17731; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17732; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17733; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17734; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17735; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17736; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17737; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17738; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17739; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17740; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17741; GFX7-NEXT:    flat_store_dword v[0:1], v2
17742; GFX7-NEXT:    s_endpgm
17743;
17744; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17745; GFX10-WGP:       ; %bb.0: ; %entry
17746; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
17747; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17748; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
17749; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
17750; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
17751; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17752; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
17753; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
17754; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
17755; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
17756; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
17757; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
17758; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17759; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
17760; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
17761; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
17762; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17763; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
17764; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
17765; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17766; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17767; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
17768; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
17769; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17770; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
17771; GFX10-WGP-NEXT:    s_endpgm
17772;
17773; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17774; GFX10-CU:       ; %bb.0: ; %entry
17775; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
17776; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17777; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
17778; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
17779; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
17780; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17781; GFX10-CU-NEXT:    s_mov_b32 s6, s4
17782; GFX10-CU-NEXT:    s_mov_b32 s7, s5
17783; GFX10-CU-NEXT:    s_mov_b32 s11, s12
17784; GFX10-CU-NEXT:    s_mov_b32 s10, s13
17785; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
17786; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
17787; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17788; GFX10-CU-NEXT:    s_mov_b32 s7, s10
17789; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
17790; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
17791; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17792; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
17793; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
17794; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17795; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17796; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
17797; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
17798; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17799; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
17800; GFX10-CU-NEXT:    s_endpgm
17801;
17802; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17803; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17804; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17805; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17806; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17807; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17808; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
17809; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17810; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
17811; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
17812; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
17813; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
17814; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
17815; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
17816; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17817; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17818; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
17819; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
17820; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17821; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
17822; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
17823; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
17824; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17825; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
17826; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
17827; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17828; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
17829; SKIP-CACHE-INV-NEXT:    s_endpgm
17830;
17831; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17832; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17833; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17834; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17835; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17836; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17837; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17838; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17839; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17840; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17841; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17842; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17843; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17844; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17845; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17846; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17847;
17848; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17849; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17850; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17851; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17852; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17853; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17854; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17855; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
17856; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17857; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17858; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17859; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17860; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17861; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17862; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
17863; GFX90A-TGSPLIT-NEXT:    s_endpgm
17864;
17865; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17866; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17867; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17868; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17869; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17870; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17871; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17872; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17873; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17874; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17875; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17876; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17877; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17878; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17879; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17880; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17881;
17882; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17883; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17884; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17885; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17886; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17887; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17888; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17889; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
17890; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17891; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
17892; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17893; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17894; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
17895; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17896; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
17897; GFX940-TGSPLIT-NEXT:    s_endpgm
17898;
17899; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17900; GFX11-WGP:       ; %bb.0: ; %entry
17901; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17902; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17903; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17904; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17905; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
17906; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
17907; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17908; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
17909; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17910; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17911; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17912; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
17913; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
17914; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17915; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
17916; GFX11-WGP-NEXT:    s_endpgm
17917;
17918; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17919; GFX11-CU:       ; %bb.0: ; %entry
17920; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17921; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17922; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17923; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17924; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
17925; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
17926; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17927; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
17928; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17929; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17930; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17931; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
17932; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
17933; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17934; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
17935; GFX11-CU-NEXT:    s_endpgm
17936;
17937; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17938; GFX12-WGP:       ; %bb.0: ; %entry
17939; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17940; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17941; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17942; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17943; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
17944; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
17945; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17946; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
17947; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17948; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17949; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17950; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
17951; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
17952; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
17953; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
17954; GFX12-WGP-NEXT:    s_endpgm
17955;
17956; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17957; GFX12-CU:       ; %bb.0: ; %entry
17958; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17959; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17960; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17961; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17962; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
17963; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
17964; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17965; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
17966; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17967; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17968; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17969; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
17970; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
17971; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
17972; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
17973; GFX12-CU-NEXT:    s_endpgm
17974    ptr %out, i32 %in, i32 %old) {
17975entry:
  ; Element index 4 of an i32 array: the cmpxchg address is 16 bytes past %out
  ; (seen above as "offset:16" or as an explicit 64-bit add of 16).
17976  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is (pointer, expected, new): %old is the compare value and
  ; %in is the replacement. Orderings: success = seq_cst, failure = acquire.
17977  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this test.
17978  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value makes the returning form of the atomic observable.
17979  store i32 %val0, ptr %out, align 4
17980  ret void
17981}
17982
; Kernel under test: a volatile cmpxchg through a generic (flat) pointer to
; %out[4] (i32 elements, i.e. byte offset 16) at syncscope("wavefront-one-as")
; with success ordering monotonic and failure ordering seq_cst. The value
; loaded by the cmpxchg (field 0 of its result pair) is then stored to %out[0].
;
; The prefixed check blocks that sit between the two halves of the signature
; are autogenerated (see the NOTE on the first line of the file), one block per
; RUN-line prefix. Regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand. In every expected sequence below, the returning
; cmpswap is followed only by address moves, a wait, and the store of v2.
17983define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
17984; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
17985; GFX7:       ; %bb.0: ; %entry
17986; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17987; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17988; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17989; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17990; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17991; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17992; GFX7-NEXT:    s_mov_b32 s6, s4
17993; GFX7-NEXT:    s_mov_b32 s7, s5
17994; GFX7-NEXT:    s_mov_b32 s11, s12
17995; GFX7-NEXT:    s_mov_b32 s10, s13
17996; GFX7-NEXT:    s_add_u32 s6, s6, s11
17997; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17998; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17999; GFX7-NEXT:    s_mov_b32 s7, s10
18000; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18001; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18002; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18003; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18004; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18005; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18006; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18007; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18008; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18009; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18010; GFX7-NEXT:    flat_store_dword v[0:1], v2
18011; GFX7-NEXT:    s_endpgm
18012;
18013; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18014; GFX10-WGP:       ; %bb.0: ; %entry
18015; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18016; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18017; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18018; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18019; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18020; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18021; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18022; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18023; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18024; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18025; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18026; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18027; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18028; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18029; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18030; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18031; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18032; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18033; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18034; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18035; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18036; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18037; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18038; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18039; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18040; GFX10-WGP-NEXT:    s_endpgm
18041;
18042; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18043; GFX10-CU:       ; %bb.0: ; %entry
18044; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18045; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18046; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18047; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18048; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18049; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18050; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18051; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18052; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18053; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18054; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18055; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18056; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18057; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18058; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18059; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18060; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18061; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18062; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18063; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18064; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18065; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18066; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18067; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18068; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18069; GFX10-CU-NEXT:    s_endpgm
18070;
18071; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18072; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18073; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18074; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18075; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18076; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18077; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18078; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18079; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18080; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18081; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18082; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18083; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18084; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18085; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18086; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18087; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18088; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18089; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18091; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18092; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18093; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18095; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18096; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18097; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18098; SKIP-CACHE-INV-NEXT:    s_endpgm
18099;
18100; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18101; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18102; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18103; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18104; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18105; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18106; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18107; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18108; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18109; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18110; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18111; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18112; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18113; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18114; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18115; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18116;
18117; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18118; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18119; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18120; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18121; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18122; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18123; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18124; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18125; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18126; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18127; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18128; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18129; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18130; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18131; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18132; GFX90A-TGSPLIT-NEXT:    s_endpgm
18133;
18134; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18135; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18136; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18137; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18138; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18139; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18140; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18141; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18142; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18143; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18144; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18145; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18146; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18147; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18148; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18149; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18150;
18151; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18152; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18153; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18154; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18155; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18156; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18157; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18158; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18159; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18160; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18161; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18162; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18163; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18164; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18165; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18166; GFX940-TGSPLIT-NEXT:    s_endpgm
18167;
18168; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18169; GFX11-WGP:       ; %bb.0: ; %entry
18170; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18171; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18172; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18173; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18174; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18175; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18176; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18177; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18178; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18179; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18180; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18181; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18182; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18183; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18184; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18185; GFX11-WGP-NEXT:    s_endpgm
18186;
18187; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18188; GFX11-CU:       ; %bb.0: ; %entry
18189; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18190; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18191; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18192; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18193; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18194; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18195; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18196; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18197; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18198; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18199; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18200; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18201; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18202; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18203; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18204; GFX11-CU-NEXT:    s_endpgm
18205;
18206; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18207; GFX12-WGP:       ; %bb.0: ; %entry
18208; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18209; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18210; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18211; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18212; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18213; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18214; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18215; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18216; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18217; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18218; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18219; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18220; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18221; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
18222; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18223; GFX12-WGP-NEXT:    s_endpgm
18224;
18225; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18226; GFX12-CU:       ; %bb.0: ; %entry
18227; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18228; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18229; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18230; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18231; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18232; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18233; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18234; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18235; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18236; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18237; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18238; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18239; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18240; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18241; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18242; GFX12-CU-NEXT:    s_endpgm
18243   ptr %out, i32 %in, i32 %old) {
18244entry:
  ; Element index 4 of an i32 array: the cmpxchg address is 16 bytes past %out
  ; (seen above as "offset:16" or as an explicit 64-bit add of 16).
18245  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is (pointer, expected, new): %old is the compare value and
  ; %in is the replacement. Orderings: success = monotonic, failure = seq_cst.
18246  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is not used by this test.
18247  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the loaded value makes the returning form of the atomic observable.
18248  store i32 %val0, ptr %out, align 4
18249  ret void
18250}
18251
18252define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
18253; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18254; GFX7:       ; %bb.0: ; %entry
18255; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18256; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18257; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18258; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18259; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18260; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18261; GFX7-NEXT:    s_mov_b32 s6, s4
18262; GFX7-NEXT:    s_mov_b32 s7, s5
18263; GFX7-NEXT:    s_mov_b32 s11, s12
18264; GFX7-NEXT:    s_mov_b32 s10, s13
18265; GFX7-NEXT:    s_add_u32 s6, s6, s11
18266; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18267; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18268; GFX7-NEXT:    s_mov_b32 s7, s10
18269; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18270; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18271; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18272; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18273; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18274; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18275; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18276; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18277; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18278; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18279; GFX7-NEXT:    flat_store_dword v[0:1], v2
18280; GFX7-NEXT:    s_endpgm
18281;
18282; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18283; GFX10-WGP:       ; %bb.0: ; %entry
18284; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18285; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18286; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18287; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18288; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18289; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18290; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18291; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18292; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18293; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18294; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18295; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18296; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18297; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18298; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18299; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18300; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18301; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18302; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18303; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18304; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18305; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18306; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18307; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18308; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18309; GFX10-WGP-NEXT:    s_endpgm
18310;
18311; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18312; GFX10-CU:       ; %bb.0: ; %entry
18313; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18314; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18315; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18316; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18317; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18318; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18319; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18320; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18321; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18322; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18323; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18324; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18325; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18326; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18327; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18328; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18329; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18330; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18331; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18332; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18333; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18334; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18335; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18336; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18337; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18338; GFX10-CU-NEXT:    s_endpgm
18339;
18340; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18341; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18342; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18343; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18344; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18345; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18346; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18347; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18348; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18349; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18350; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18351; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18352; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18353; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18354; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18355; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18357; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18358; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18359; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18360; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18361; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18362; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18363; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18364; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18365; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18366; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18367; SKIP-CACHE-INV-NEXT:    s_endpgm
18368;
18369; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18370; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18371; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18372; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18373; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18374; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18375; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18376; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18377; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18378; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18379; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18380; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18381; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18382; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18383; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18384; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18385;
18386; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18387; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18388; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18389; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18390; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18391; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18392; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18393; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18394; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18395; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18396; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18397; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18398; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18399; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18400; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18401; GFX90A-TGSPLIT-NEXT:    s_endpgm
18402;
18403; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18404; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18405; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18406; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18407; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18408; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18409; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18410; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18411; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18412; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18413; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18414; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18415; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18416; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18417; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18418; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18419;
18420; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18421; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18422; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18423; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18424; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18425; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18426; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18427; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18428; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18429; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18430; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18431; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18432; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18433; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18434; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18435; GFX940-TGSPLIT-NEXT:    s_endpgm
18436;
18437; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18438; GFX11-WGP:       ; %bb.0: ; %entry
18439; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18440; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18441; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18442; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18443; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18444; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18445; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18446; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18447; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18448; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18449; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18450; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18451; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18452; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18453; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18454; GFX11-WGP-NEXT:    s_endpgm
18455;
18456; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18457; GFX11-CU:       ; %bb.0: ; %entry
18458; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18459; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18460; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18461; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18462; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18463; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18464; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18465; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18466; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18467; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18468; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18469; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18470; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18471; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18472; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18473; GFX11-CU-NEXT:    s_endpgm
18474;
18475; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18476; GFX12-WGP:       ; %bb.0: ; %entry
18477; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18478; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18479; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18480; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18481; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18482; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18483; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18484; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18485; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18486; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18487; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18488; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18489; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18490; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
18491; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18492; GFX12-WGP-NEXT:    s_endpgm
18493;
18494; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18495; GFX12-CU:       ; %bb.0: ; %entry
18496; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18497; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18498; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18499; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18500; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18501; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18502; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18503; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18504; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18505; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18506; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18507; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18508; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18509; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18510; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18511; GFX12-CU-NEXT:    s_endpgm
18512   ptr %out, i32 %in, i32 %old) {
18513entry:
18514  %gep = getelementptr i32, ptr %out, i32 4
18515  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
18516  %val0 = extractvalue { i32, i1 } %val, 0
18517  store i32 %val0, ptr %out, align 4
18518  ret void
18519}
18520
; Kernel under test: a value-returning cmpxchg through a flat pointer at
; syncscope "wavefront-one-as", with success ordering release and failure
; ordering seq_cst.
; The compare-and-swap targets the i32 located four elements past %out; the
; value it reads back is then stored to %out itself.
; In every expected sequence below the atomic is emitted in its returning form
; (glc, sc0 or th:TH_ATOMIC_RETURN depending on the target), an s_waitcnt /
; s_wait_* precedes the dependent flat store, and no cache invalidate or
; write-back instruction appears.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script rather than editing them by hand.
18521define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
18522; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18523; GFX7:       ; %bb.0: ; %entry
18524; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18525; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18526; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18527; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18528; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18529; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18530; GFX7-NEXT:    s_mov_b32 s6, s4
18531; GFX7-NEXT:    s_mov_b32 s7, s5
18532; GFX7-NEXT:    s_mov_b32 s11, s12
18533; GFX7-NEXT:    s_mov_b32 s10, s13
18534; GFX7-NEXT:    s_add_u32 s6, s6, s11
18535; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18536; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18537; GFX7-NEXT:    s_mov_b32 s7, s10
18538; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18539; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18540; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18541; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18542; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18543; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18544; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18545; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18546; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18547; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18548; GFX7-NEXT:    flat_store_dword v[0:1], v2
18549; GFX7-NEXT:    s_endpgm
18550;
18551; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18552; GFX10-WGP:       ; %bb.0: ; %entry
18553; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18554; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18555; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18556; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18557; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18558; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18559; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18560; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18561; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18562; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18563; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18564; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18565; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18566; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18567; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18568; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18569; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18570; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18571; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18572; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18573; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18574; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18575; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18576; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18577; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18578; GFX10-WGP-NEXT:    s_endpgm
18579;
18580; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18581; GFX10-CU:       ; %bb.0: ; %entry
18582; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18583; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18584; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18585; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18586; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18587; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18588; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18589; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18590; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18591; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18592; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18593; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18594; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18595; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18596; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18597; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18598; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18599; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18600; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18601; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18602; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18603; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18604; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18605; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18606; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18607; GFX10-CU-NEXT:    s_endpgm
18608;
18609; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18610; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18611; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18612; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18613; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18614; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18615; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18616; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18617; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18618; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18619; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18620; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18621; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18622; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18623; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18624; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18625; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18626; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18627; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18628; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18629; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18630; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18631; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18632; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18633; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18634; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18635; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18636; SKIP-CACHE-INV-NEXT:    s_endpgm
18637;
18638; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18639; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18640; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18641; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18642; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18643; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18644; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18645; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18646; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18647; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18648; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18649; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18650; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18651; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18652; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18653; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18654;
18655; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18656; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18657; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18658; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18659; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18660; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18661; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18662; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18663; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18664; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18665; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18666; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18667; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18668; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18669; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18670; GFX90A-TGSPLIT-NEXT:    s_endpgm
18671;
18672; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18673; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18674; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18675; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18676; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18677; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18678; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18679; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18680; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18681; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18682; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18683; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18684; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18685; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18686; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18687; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18688;
18689; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18690; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18691; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18692; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18693; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18694; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18695; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18696; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18697; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18698; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18699; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18700; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18701; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18702; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18703; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18704; GFX940-TGSPLIT-NEXT:    s_endpgm
18705;
18706; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18707; GFX11-WGP:       ; %bb.0: ; %entry
18708; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18709; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18710; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18711; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18712; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18713; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18714; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18715; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18716; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18717; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18718; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18719; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18720; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18721; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18722; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18723; GFX11-WGP-NEXT:    s_endpgm
18724;
18725; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18726; GFX11-CU:       ; %bb.0: ; %entry
18727; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18728; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18729; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18730; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18731; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
18732; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
18733; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18734; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
18735; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18736; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18737; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18738; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
18739; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
18740; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18741; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
18742; GFX11-CU-NEXT:    s_endpgm
18743;
18744; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18745; GFX12-WGP:       ; %bb.0: ; %entry
18746; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18747; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18748; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18749; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18750; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
18751; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
18752; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18753; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
18754; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18755; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18756; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18757; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
18758; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
18759; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
18760; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
18761; GFX12-WGP-NEXT:    s_endpgm
18762;
18763; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18764; GFX12-CU:       ; %bb.0: ; %entry
18765; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18766; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18767; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18768; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18769; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
18770; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
18771; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18772; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
18773; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18774; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18775; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18776; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
18777; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
18778; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
18779; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
18780; GFX12-CU-NEXT:    s_endpgm
18781   ptr %out, i32 %in, i32 %old) {
18782entry:
  ; Address of the fifth i32 counted from %out (byte offset 16 in the expected code).
18783  %gep = getelementptr i32, ptr %out, i32 4
  ; Compare the i32 at %gep with %old and, on a match, replace it with %in.
  ; The result is the pair { loaded value, success flag }.
18784  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
  ; Keep only the loaded value (field 0); the success flag is not used.
18785  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the loaded value back through the base pointer %out.
18786  store i32 %val0, ptr %out, align 4
18787  ret void
18788}
18789
18790define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
18791; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18792; GFX7:       ; %bb.0: ; %entry
18793; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18794; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18795; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18796; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18797; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18798; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18799; GFX7-NEXT:    s_mov_b32 s6, s4
18800; GFX7-NEXT:    s_mov_b32 s7, s5
18801; GFX7-NEXT:    s_mov_b32 s11, s12
18802; GFX7-NEXT:    s_mov_b32 s10, s13
18803; GFX7-NEXT:    s_add_u32 s6, s6, s11
18804; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18805; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18806; GFX7-NEXT:    s_mov_b32 s7, s10
18807; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18808; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18809; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18810; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18811; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18812; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18813; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18814; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18815; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18816; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18817; GFX7-NEXT:    flat_store_dword v[0:1], v2
18818; GFX7-NEXT:    s_endpgm
18819;
18820; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18821; GFX10-WGP:       ; %bb.0: ; %entry
18822; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
18823; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18824; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
18825; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
18826; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
18827; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18828; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
18829; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
18830; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
18831; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
18832; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
18833; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
18834; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18835; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
18836; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
18837; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
18838; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18839; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
18840; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
18841; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18842; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18843; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
18844; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
18845; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18846; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
18847; GFX10-WGP-NEXT:    s_endpgm
18848;
18849; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18850; GFX10-CU:       ; %bb.0: ; %entry
18851; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
18852; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18853; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
18854; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
18855; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
18856; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18857; GFX10-CU-NEXT:    s_mov_b32 s6, s4
18858; GFX10-CU-NEXT:    s_mov_b32 s7, s5
18859; GFX10-CU-NEXT:    s_mov_b32 s11, s12
18860; GFX10-CU-NEXT:    s_mov_b32 s10, s13
18861; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
18862; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
18863; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18864; GFX10-CU-NEXT:    s_mov_b32 s7, s10
18865; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
18866; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
18867; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18868; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
18869; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
18870; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18871; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18872; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
18873; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
18874; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18875; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
18876; GFX10-CU-NEXT:    s_endpgm
18877;
18878; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18879; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18880; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18881; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18882; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18883; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18884; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
18885; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18886; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
18887; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
18888; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
18889; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
18890; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
18891; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
18892; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18893; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18894; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
18895; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
18896; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18897; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
18898; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
18899; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
18900; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18901; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
18902; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
18903; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18904; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
18905; SKIP-CACHE-INV-NEXT:    s_endpgm
18906;
18907; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18908; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18909; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18910; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18911; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18912; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18913; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18914; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18915; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18916; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18917; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18918; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18919; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18920; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18921; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18922; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18923;
18924; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18925; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18926; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18927; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18928; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18929; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18930; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18931; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
18932; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18933; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18934; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18935; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18936; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18937; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18938; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
18939; GFX90A-TGSPLIT-NEXT:    s_endpgm
18940;
18941; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18942; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18943; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18944; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18945; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18946; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18947; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18948; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18949; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18950; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18951; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18952; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18953; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18954; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18955; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18956; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18957;
18958; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18959; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18960; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18961; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18962; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18963; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18964; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18965; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
18966; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18967; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
18968; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18969; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18970; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
18971; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18972; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
18973; GFX940-TGSPLIT-NEXT:    s_endpgm
18974;
18975; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18976; GFX11-WGP:       ; %bb.0: ; %entry
18977; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18978; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18979; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18980; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18981; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
18982; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
18983; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18984; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
18985; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18986; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18987; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18988; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
18989; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
18990; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18991; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
18992; GFX11-WGP-NEXT:    s_endpgm
18993;
18994; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
18995; GFX11-CU:       ; %bb.0: ; %entry
18996; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18997; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18998; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18999; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19000; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19001; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19002; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19003; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19004; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19005; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19006; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19007; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19008; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19009; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19010; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19011; GFX11-CU-NEXT:    s_endpgm
19012;
19013; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
19014; GFX12-WGP:       ; %bb.0: ; %entry
19015; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19016; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19017; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19018; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19019; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19020; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19021; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19022; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19023; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19024; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19025; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19026; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19027; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19028; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
19029; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19030; GFX12-WGP-NEXT:    s_endpgm
19031;
19032; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
19033; GFX12-CU:       ; %bb.0: ; %entry
19034; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19035; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19036; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19037; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19038; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19039; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19040; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19041; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19042; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19043; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19044; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19045; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19046; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19047; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19048; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19049; GFX12-CU-NEXT:    s_endpgm
19050   ptr %out, i32 %in, i32 %old) {
19051entry:
19052  %gep = getelementptr i32, ptr %out, i32 4
19053  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
19054  %val0 = extractvalue { i32, i1 } %val, 0
19055  store i32 %val0, ptr %out, align 4
19056  ret void
19057}
19058
; Test: value-returning flat cmpxchg at syncscope("wavefront-one-as") with
; success ordering seq_cst and failure ordering seq_cst.
;
; The kernel compares the i32 at %out[4] (byte offset 16) against %old, swaps in
; %in on a match, and stores the value that was loaded by the cmpxchg to %out[0].
;
; The expectation lines between the opening parenthesis and the parameter list
; are autogenerated (see the NOTE on the first line of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
; What the expected sequences below show:
;   - the gfx700 and gfx1010 runs materialize the +16 address with
;     s_add_u32/s_addc_u32, while the gfx90a, gfx940, gfx1100 and gfx1200 runs
;     fold it into the instruction as offset:16;
;   - the returning form of the atomic is selected (glc, sc0, or
;     th:TH_ATOMIC_RETURN depending on the target);
;   - the only instruction between the atomic and the dependent store, besides
;     the address moves, is a wait (s_waitcnt / s_wait_loadcnt_dscnt).
19059define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
19060; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19061; GFX7:       ; %bb.0: ; %entry
19062; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
19063; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19064; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
19065; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
19066; GFX7-NEXT:    s_mov_b64 s[12:13], 16
19067; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19068; GFX7-NEXT:    s_mov_b32 s6, s4
19069; GFX7-NEXT:    s_mov_b32 s7, s5
19070; GFX7-NEXT:    s_mov_b32 s11, s12
19071; GFX7-NEXT:    s_mov_b32 s10, s13
19072; GFX7-NEXT:    s_add_u32 s6, s6, s11
19073; GFX7-NEXT:    s_addc_u32 s10, s7, s10
19074; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19075; GFX7-NEXT:    s_mov_b32 s7, s10
19076; GFX7-NEXT:    v_mov_b32_e32 v2, s9
19077; GFX7-NEXT:    v_mov_b32_e32 v0, s8
19078; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19079; GFX7-NEXT:    v_mov_b32_e32 v3, v0
19080; GFX7-NEXT:    v_mov_b32_e32 v0, s6
19081; GFX7-NEXT:    v_mov_b32_e32 v1, s7
19082; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19083; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19084; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19085; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19086; GFX7-NEXT:    flat_store_dword v[0:1], v2
19087; GFX7-NEXT:    s_endpgm
19088;
19089; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19090; GFX10-WGP:       ; %bb.0: ; %entry
19091; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
19092; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19093; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
19094; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
19095; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
19096; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19097; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
19098; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
19099; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
19100; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
19101; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
19102; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
19103; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19104; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
19105; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
19106; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
19107; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19108; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
19109; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
19110; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19111; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19112; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
19113; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
19114; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19115; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
19116; GFX10-WGP-NEXT:    s_endpgm
19117;
19118; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19119; GFX10-CU:       ; %bb.0: ; %entry
19120; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
19121; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
19122; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
19123; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
19124; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
19125; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19126; GFX10-CU-NEXT:    s_mov_b32 s6, s4
19127; GFX10-CU-NEXT:    s_mov_b32 s7, s5
19128; GFX10-CU-NEXT:    s_mov_b32 s11, s12
19129; GFX10-CU-NEXT:    s_mov_b32 s10, s13
19130; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
19131; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
19132; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19133; GFX10-CU-NEXT:    s_mov_b32 s7, s10
19134; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
19135; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
19136; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19137; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
19138; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
19139; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19140; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19141; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
19142; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
19143; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19144; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
19145; GFX10-CU-NEXT:    s_endpgm
19146;
19147; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19148; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19149; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19150; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19151; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19152; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19153; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
19154; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19155; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
19156; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
19157; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
19158; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
19159; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
19160; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
19161; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19162; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19163; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
19164; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
19165; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19166; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
19167; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
19168; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
19169; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19170; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
19171; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
19172; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19173; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
19174; SKIP-CACHE-INV-NEXT:    s_endpgm
19175;
19176; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19177; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19178; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19179; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19180; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19181; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19182; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19183; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19184; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19185; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19186; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19187; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19188; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19189; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19190; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19191; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19192;
19193; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19194; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19195; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19196; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19197; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19198; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19199; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19200; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
19201; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19202; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19203; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19204; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19205; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19206; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19207; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
19208; GFX90A-TGSPLIT-NEXT:    s_endpgm
19209;
19210; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19211; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19212; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19213; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19214; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19215; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19216; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19217; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19218; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19219; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19220; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19221; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19222; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19223; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19224; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19225; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19226;
19227; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19228; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19229; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19230; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19231; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19232; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19233; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19234; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
19235; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19236; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
19237; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19238; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19239; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
19240; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19241; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
19242; GFX940-TGSPLIT-NEXT:    s_endpgm
19243;
19244; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19245; GFX11-WGP:       ; %bb.0: ; %entry
19246; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19247; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19248; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19249; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19250; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
19251; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
19252; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19253; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
19254; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19255; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19256; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19257; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
19258; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
19259; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19260; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
19261; GFX11-WGP-NEXT:    s_endpgm
19262;
19263; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19264; GFX11-CU:       ; %bb.0: ; %entry
19265; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19266; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19267; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19268; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19269; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
19270; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
19271; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19272; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
19273; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19274; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19275; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19276; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
19277; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
19278; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19279; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
19280; GFX11-CU-NEXT:    s_endpgm
19281;
19282; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19283; GFX12-WGP:       ; %bb.0: ; %entry
19284; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19285; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19286; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19287; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19288; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
19289; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
19290; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19291; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
19292; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19293; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19294; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19295; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
19296; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
19297; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
19298; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
19299; GFX12-WGP-NEXT:    s_endpgm
19300;
19301; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19302; GFX12-CU:       ; %bb.0: ; %entry
19303; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19304; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19305; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19306; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19307; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
19308; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
19309; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19310; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
19311; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19312; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19313; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19314; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
19315; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
19316; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
19317; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
19318; GFX12-CU-NEXT:    s_endpgm
19319    ptr %out, i32 %in, i32 %old) {
19320entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
19321  %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: expected value %old, replacement %in; the first ordering
; is the success ordering and the second is the failure ordering.
19322  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
; Field 0 of the { i32, i1 } result is the value loaded from memory; storing it
; keeps the result live so the returning form of the atomic is emitted.
19323  %val0 = extractvalue { i32, i1 } %val, 0
19324  store i32 %val0, ptr %out, align 4
19325  ret void
19326}
19327