xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
14; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
15
; Kernel under test: performs an atomic i32 load from %in with
; syncscope("wavefront") and `unordered` ordering, then writes the loaded value
; to %out with an ordinary (non-atomic) store. Both pointers are addrspace(1).
;
; The expected sequence for every tested configuration has the same shape: two
; scalar loads whose results are used as the load and store addresses, the
; memory load with no cache-policy modifier, a wait on the vector-memory/load
; counter, the store, and s_endpgm. No cache-invalidate instruction appears in
; any expected sequence. The only modifiers present are `sc0 sc1` on the store
; in the two gfx940 configurations.
;
; The assertion lines between the `define` line and the parameter list are
; autogenerated (see the NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
16define amdgpu_kernel void @global_wavefront_unordered_load(
17; GFX6-LABEL: global_wavefront_unordered_load:
18; GFX6:       ; %bb.0: ; %entry
19; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
20; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
21; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
22; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
23; GFX6-NEXT:    s_mov_b32 s6, s9
24; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
25; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
26; GFX6-NEXT:    s_mov_b32 s13, -1
27; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
28; GFX6-NEXT:    s_mov_b32 s9, s6
29; GFX6-NEXT:    s_mov_b32 s10, s13
30; GFX6-NEXT:    s_mov_b32 s11, s12
31; GFX6-NEXT:    s_mov_b32 s14, s5
32; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
33; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
34; GFX6-NEXT:    s_mov_b32 s5, s14
35; GFX6-NEXT:    s_mov_b32 s6, s13
36; GFX6-NEXT:    s_mov_b32 s7, s12
37; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
38; GFX6-NEXT:    s_waitcnt vmcnt(0)
39; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
40; GFX6-NEXT:    s_endpgm
41;
42; GFX7-LABEL: global_wavefront_unordered_load:
43; GFX7:       ; %bb.0: ; %entry
44; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
45; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
46; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX7-NEXT:    v_mov_b32_e32 v0, s6
48; GFX7-NEXT:    v_mov_b32_e32 v1, s7
49; GFX7-NEXT:    flat_load_dword v2, v[0:1]
50; GFX7-NEXT:    v_mov_b32_e32 v0, s4
51; GFX7-NEXT:    v_mov_b32_e32 v1, s5
52; GFX7-NEXT:    s_waitcnt vmcnt(0)
53; GFX7-NEXT:    flat_store_dword v[0:1], v2
54; GFX7-NEXT:    s_endpgm
55;
56; GFX10-WGP-LABEL: global_wavefront_unordered_load:
57; GFX10-WGP:       ; %bb.0: ; %entry
58; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
59; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
60; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
61; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
63; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
64; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
65; GFX10-WGP-NEXT:    s_endpgm
66;
67; GFX10-CU-LABEL: global_wavefront_unordered_load:
68; GFX10-CU:       ; %bb.0: ; %entry
69; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
70; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
71; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
72; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
74; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
75; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
76; GFX10-CU-NEXT:    s_endpgm
77;
78; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_load:
79; SKIP-CACHE-INV:       ; %bb.0: ; %entry
80; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
81; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
82; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
83; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
84; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
85; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
86; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
87; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
88; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
89; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
90; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
91; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
92; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
93; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
94; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
95; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
96; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
97; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
98; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
99; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
100; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
101; SKIP-CACHE-INV-NEXT:    s_endpgm
102;
103; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_load:
104; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
105; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
106; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
107; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
108; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
110; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
111; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
112; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
113;
114; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_load:
115; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
116; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
117; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
118; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
119; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
121; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
122; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
123; GFX90A-TGSPLIT-NEXT:    s_endpgm
124;
125; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_load:
126; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
127; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
128; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
129; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
130; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
132; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
133; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
134; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
135;
136; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_load:
137; GFX940-TGSPLIT:       ; %bb.0: ; %entry
138; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
139; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
140; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
141; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
143; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
144; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
145; GFX940-TGSPLIT-NEXT:    s_endpgm
146;
147; GFX11-WGP-LABEL: global_wavefront_unordered_load:
148; GFX11-WGP:       ; %bb.0: ; %entry
149; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
150; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
151; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
152; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
154; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
155; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
156; GFX11-WGP-NEXT:    s_endpgm
157;
158; GFX11-CU-LABEL: global_wavefront_unordered_load:
159; GFX11-CU:       ; %bb.0: ; %entry
160; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
161; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
162; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
163; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
165; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
166; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
167; GFX11-CU-NEXT:    s_endpgm
168;
169; GFX12-WGP-LABEL: global_wavefront_unordered_load:
170; GFX12-WGP:       ; %bb.0: ; %entry
171; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
172; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
173; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
174; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
175; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
176; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
177; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
178; GFX12-WGP-NEXT:    s_endpgm
179;
180; GFX12-CU-LABEL: global_wavefront_unordered_load:
181; GFX12-CU:       ; %bb.0: ; %entry
182; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
183; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
184; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
185; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
186; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
187; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
188; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
189; GFX12-CU-NEXT:    s_endpgm
190    ptr addrspace(1) %in, ptr addrspace(1) %out) {
191entry:
  ; The atomic load is the operation under test; the plain store to %out only
  ; gives the loaded value a use.
192  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4
193  store i32 %val, ptr addrspace(1) %out
194  ret void
195}
196
; Kernel under test: performs an atomic i32 load from %in with
; syncscope("wavefront") and `monotonic` ordering, then writes the loaded value
; to %out with an ordinary (non-atomic) store. Both pointers are addrspace(1).
;
; The expected sequence for every tested configuration has the same shape: two
; scalar loads whose results are used as the load and store addresses, the
; memory load with no cache-policy modifier, a wait on the vector-memory/load
; counter, the store, and s_endpgm. No cache-invalidate instruction appears in
; any expected sequence. The only modifiers present are `sc0 sc1` on the store
; in the two gfx940 configurations.
;
; The assertion lines between the `define` line and the parameter list are
; autogenerated (see the NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
197define amdgpu_kernel void @global_wavefront_monotonic_load(
198; GFX6-LABEL: global_wavefront_monotonic_load:
199; GFX6:       ; %bb.0: ; %entry
200; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
201; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
202; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
203; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX6-NEXT:    s_mov_b32 s6, s9
205; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
206; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
207; GFX6-NEXT:    s_mov_b32 s13, -1
208; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
209; GFX6-NEXT:    s_mov_b32 s9, s6
210; GFX6-NEXT:    s_mov_b32 s10, s13
211; GFX6-NEXT:    s_mov_b32 s11, s12
212; GFX6-NEXT:    s_mov_b32 s14, s5
213; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
214; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
215; GFX6-NEXT:    s_mov_b32 s5, s14
216; GFX6-NEXT:    s_mov_b32 s6, s13
217; GFX6-NEXT:    s_mov_b32 s7, s12
218; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
219; GFX6-NEXT:    s_waitcnt vmcnt(0)
220; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
221; GFX6-NEXT:    s_endpgm
222;
223; GFX7-LABEL: global_wavefront_monotonic_load:
224; GFX7:       ; %bb.0: ; %entry
225; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
226; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
227; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX7-NEXT:    v_mov_b32_e32 v0, s6
229; GFX7-NEXT:    v_mov_b32_e32 v1, s7
230; GFX7-NEXT:    flat_load_dword v2, v[0:1]
231; GFX7-NEXT:    v_mov_b32_e32 v0, s4
232; GFX7-NEXT:    v_mov_b32_e32 v1, s5
233; GFX7-NEXT:    s_waitcnt vmcnt(0)
234; GFX7-NEXT:    flat_store_dword v[0:1], v2
235; GFX7-NEXT:    s_endpgm
236;
237; GFX10-WGP-LABEL: global_wavefront_monotonic_load:
238; GFX10-WGP:       ; %bb.0: ; %entry
239; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
240; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
241; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
242; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
244; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
245; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
246; GFX10-WGP-NEXT:    s_endpgm
247;
248; GFX10-CU-LABEL: global_wavefront_monotonic_load:
249; GFX10-CU:       ; %bb.0: ; %entry
250; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
251; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
252; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
253; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
255; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
256; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
257; GFX10-CU-NEXT:    s_endpgm
258;
259; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_load:
260; SKIP-CACHE-INV:       ; %bb.0: ; %entry
261; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
262; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
263; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
264; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
265; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
266; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
267; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
268; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
269; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
270; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
271; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
272; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
273; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
274; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
275; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
276; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
277; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
278; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
279; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
280; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
281; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
282; SKIP-CACHE-INV-NEXT:    s_endpgm
283;
284; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load:
285; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
286; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
287; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
288; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
289; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
291; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
292; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
293; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
294;
295; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_load:
296; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
297; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
298; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
299; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
300; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
302; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
303; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
304; GFX90A-TGSPLIT-NEXT:    s_endpgm
305;
306; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load:
307; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
308; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
309; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
310; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
311; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
313; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
314; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
315; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
316;
317; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_load:
318; GFX940-TGSPLIT:       ; %bb.0: ; %entry
319; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
320; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
321; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
322; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
324; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
325; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
326; GFX940-TGSPLIT-NEXT:    s_endpgm
327;
328; GFX11-WGP-LABEL: global_wavefront_monotonic_load:
329; GFX11-WGP:       ; %bb.0: ; %entry
330; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
331; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
332; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
333; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
335; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
336; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
337; GFX11-WGP-NEXT:    s_endpgm
338;
339; GFX11-CU-LABEL: global_wavefront_monotonic_load:
340; GFX11-CU:       ; %bb.0: ; %entry
341; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
342; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
343; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
344; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
345; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
346; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
347; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
348; GFX11-CU-NEXT:    s_endpgm
349;
350; GFX12-WGP-LABEL: global_wavefront_monotonic_load:
351; GFX12-WGP:       ; %bb.0: ; %entry
352; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
353; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
354; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
355; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
356; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
357; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
358; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
359; GFX12-WGP-NEXT:    s_endpgm
360;
361; GFX12-CU-LABEL: global_wavefront_monotonic_load:
362; GFX12-CU:       ; %bb.0: ; %entry
363; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
364; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
365; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
366; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
367; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
368; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
369; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
370; GFX12-CU-NEXT:    s_endpgm
371    ptr addrspace(1) %in, ptr addrspace(1) %out) {
372entry:
  ; The atomic load is the operation under test; the plain store to %out only
  ; gives the loaded value a use.
373  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4
374  store i32 %val, ptr addrspace(1) %out
375  ret void
376}
377
; Kernel under test: performs an atomic i32 load from %in with
; syncscope("wavefront") and `acquire` ordering, then writes the loaded value
; to %out with an ordinary (non-atomic) store. Both pointers are addrspace(1).
;
; The expected sequence for every tested configuration has the same shape: two
; scalar loads whose results are used as the load and store addresses, the
; memory load with no cache-policy modifier, a wait on the vector-memory/load
; counter, the store, and s_endpgm. No cache-invalidate instruction follows the
; load in any expected sequence. The only modifiers present are `sc0 sc1` on
; the store in the two gfx940 configurations.
;
; The assertion lines between the `define` line and the parameter list are
; autogenerated (see the NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
378define amdgpu_kernel void @global_wavefront_acquire_load(
379; GFX6-LABEL: global_wavefront_acquire_load:
380; GFX6:       ; %bb.0: ; %entry
381; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
382; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
383; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
384; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX6-NEXT:    s_mov_b32 s6, s9
386; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
387; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
388; GFX6-NEXT:    s_mov_b32 s13, -1
389; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
390; GFX6-NEXT:    s_mov_b32 s9, s6
391; GFX6-NEXT:    s_mov_b32 s10, s13
392; GFX6-NEXT:    s_mov_b32 s11, s12
393; GFX6-NEXT:    s_mov_b32 s14, s5
394; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
395; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
396; GFX6-NEXT:    s_mov_b32 s5, s14
397; GFX6-NEXT:    s_mov_b32 s6, s13
398; GFX6-NEXT:    s_mov_b32 s7, s12
399; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
400; GFX6-NEXT:    s_waitcnt vmcnt(0)
401; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
402; GFX6-NEXT:    s_endpgm
403;
404; GFX7-LABEL: global_wavefront_acquire_load:
405; GFX7:       ; %bb.0: ; %entry
406; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
407; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
408; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX7-NEXT:    v_mov_b32_e32 v0, s6
410; GFX7-NEXT:    v_mov_b32_e32 v1, s7
411; GFX7-NEXT:    flat_load_dword v2, v[0:1]
412; GFX7-NEXT:    v_mov_b32_e32 v0, s4
413; GFX7-NEXT:    v_mov_b32_e32 v1, s5
414; GFX7-NEXT:    s_waitcnt vmcnt(0)
415; GFX7-NEXT:    flat_store_dword v[0:1], v2
416; GFX7-NEXT:    s_endpgm
417;
418; GFX10-WGP-LABEL: global_wavefront_acquire_load:
419; GFX10-WGP:       ; %bb.0: ; %entry
420; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
421; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
422; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
423; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
425; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
426; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
427; GFX10-WGP-NEXT:    s_endpgm
428;
429; GFX10-CU-LABEL: global_wavefront_acquire_load:
430; GFX10-CU:       ; %bb.0: ; %entry
431; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
432; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
433; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
434; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
436; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
437; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
438; GFX10-CU-NEXT:    s_endpgm
439;
440; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_load:
441; SKIP-CACHE-INV:       ; %bb.0: ; %entry
442; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
443; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
444; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
445; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
446; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
447; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
448; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
449; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
450; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
451; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
452; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
453; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
454; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
455; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
456; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
457; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
458; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
459; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
460; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
461; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
462; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
463; SKIP-CACHE-INV-NEXT:    s_endpgm
464;
465; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_load:
466; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
467; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
468; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
469; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
470; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
472; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
473; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
474; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
475;
476; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_load:
477; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
478; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
479; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
480; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
481; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
482; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
483; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
484; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
485; GFX90A-TGSPLIT-NEXT:    s_endpgm
486;
487; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_load:
488; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
489; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
490; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
491; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
492; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
494; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
495; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
496; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
497;
498; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_load:
499; GFX940-TGSPLIT:       ; %bb.0: ; %entry
500; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
501; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
502; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
503; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
504; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
505; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
506; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
507; GFX940-TGSPLIT-NEXT:    s_endpgm
508;
509; GFX11-WGP-LABEL: global_wavefront_acquire_load:
510; GFX11-WGP:       ; %bb.0: ; %entry
511; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
512; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
513; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
514; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
516; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
517; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
518; GFX11-WGP-NEXT:    s_endpgm
519;
520; GFX11-CU-LABEL: global_wavefront_acquire_load:
521; GFX11-CU:       ; %bb.0: ; %entry
522; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
523; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
524; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
525; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
527; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
528; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
529; GFX11-CU-NEXT:    s_endpgm
530;
531; GFX12-WGP-LABEL: global_wavefront_acquire_load:
532; GFX12-WGP:       ; %bb.0: ; %entry
533; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
534; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
535; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
536; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
537; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
538; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
539; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
540; GFX12-WGP-NEXT:    s_endpgm
541;
542; GFX12-CU-LABEL: global_wavefront_acquire_load:
543; GFX12-CU:       ; %bb.0: ; %entry
544; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
545; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
546; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
547; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
548; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
549; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
550; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
551; GFX12-CU-NEXT:    s_endpgm
552    ptr addrspace(1) %in, ptr addrspace(1) %out) {
553entry:
  ; The atomic load is the operation under test; the plain store to %out only
  ; gives the loaded value a use.
554  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4
555  store i32 %val, ptr addrspace(1) %out
556  ret void
557}
558
559define amdgpu_kernel void @global_wavefront_seq_cst_load(
560; GFX6-LABEL: global_wavefront_seq_cst_load:
561; GFX6:       ; %bb.0: ; %entry
562; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
563; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
564; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
565; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX6-NEXT:    s_mov_b32 s6, s9
567; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
568; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
569; GFX6-NEXT:    s_mov_b32 s13, -1
570; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
571; GFX6-NEXT:    s_mov_b32 s9, s6
572; GFX6-NEXT:    s_mov_b32 s10, s13
573; GFX6-NEXT:    s_mov_b32 s11, s12
574; GFX6-NEXT:    s_mov_b32 s14, s5
575; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
576; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
577; GFX6-NEXT:    s_mov_b32 s5, s14
578; GFX6-NEXT:    s_mov_b32 s6, s13
579; GFX6-NEXT:    s_mov_b32 s7, s12
580; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
581; GFX6-NEXT:    s_waitcnt vmcnt(0)
582; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
583; GFX6-NEXT:    s_endpgm
584;
585; GFX7-LABEL: global_wavefront_seq_cst_load:
586; GFX7:       ; %bb.0: ; %entry
587; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
588; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
589; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX7-NEXT:    v_mov_b32_e32 v0, s6
591; GFX7-NEXT:    v_mov_b32_e32 v1, s7
592; GFX7-NEXT:    flat_load_dword v2, v[0:1]
593; GFX7-NEXT:    v_mov_b32_e32 v0, s4
594; GFX7-NEXT:    v_mov_b32_e32 v1, s5
595; GFX7-NEXT:    s_waitcnt vmcnt(0)
596; GFX7-NEXT:    flat_store_dword v[0:1], v2
597; GFX7-NEXT:    s_endpgm
598;
599; GFX10-WGP-LABEL: global_wavefront_seq_cst_load:
600; GFX10-WGP:       ; %bb.0: ; %entry
601; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
602; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
603; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
604; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
606; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
607; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
608; GFX10-WGP-NEXT:    s_endpgm
609;
610; GFX10-CU-LABEL: global_wavefront_seq_cst_load:
611; GFX10-CU:       ; %bb.0: ; %entry
612; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
613; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
614; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
615; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
616; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
617; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
618; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
619; GFX10-CU-NEXT:    s_endpgm
620;
621; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_load:
622; SKIP-CACHE-INV:       ; %bb.0: ; %entry
623; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
624; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
625; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
626; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
627; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
628; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
629; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
630; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
631; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
632; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
633; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
634; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
635; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
636; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
637; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
638; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
639; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
640; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
641; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
642; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
643; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
644; SKIP-CACHE-INV-NEXT:    s_endpgm
645;
646; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load:
647; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
648; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
649; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
650; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
651; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
653; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
654; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
655; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
656;
657; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_load:
658; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
659; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
660; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
661; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
662; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
663; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
664; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
665; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
666; GFX90A-TGSPLIT-NEXT:    s_endpgm
667;
668; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load:
669; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
670; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
671; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
672; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
673; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
675; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
676; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
677; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
678;
679; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_load:
680; GFX940-TGSPLIT:       ; %bb.0: ; %entry
681; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
682; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
683; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
684; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
685; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
686; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
687; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
688; GFX940-TGSPLIT-NEXT:    s_endpgm
689;
690; GFX11-WGP-LABEL: global_wavefront_seq_cst_load:
691; GFX11-WGP:       ; %bb.0: ; %entry
692; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
693; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
694; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
695; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
697; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
698; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
699; GFX11-WGP-NEXT:    s_endpgm
700;
701; GFX11-CU-LABEL: global_wavefront_seq_cst_load:
702; GFX11-CU:       ; %bb.0: ; %entry
703; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
704; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
705; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
706; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
708; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
709; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
710; GFX11-CU-NEXT:    s_endpgm
711;
712; GFX12-WGP-LABEL: global_wavefront_seq_cst_load:
713; GFX12-WGP:       ; %bb.0: ; %entry
714; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
715; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
716; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
717; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
718; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
719; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
720; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
721; GFX12-WGP-NEXT:    s_endpgm
722;
723; GFX12-CU-LABEL: global_wavefront_seq_cst_load:
724; GFX12-CU:       ; %bb.0: ; %entry
725; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
726; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
727; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
728; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
729; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
730; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
731; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
732; GFX12-CU-NEXT:    s_endpgm
733    ptr addrspace(1) %in, ptr addrspace(1) %out) {
734entry:
735  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4
736  store i32 %val, ptr addrspace(1) %out
737  ret void
738}
739
; Test: atomic i32 store to global memory (addrspace(1)) with
; syncscope("wavefront") and `unordered` ordering.
; The kernel receives the value (%in) and the destination pointer (%out) as
; arguments. For every check prefix the expected code is: argument loads and
; register setup, one wait placed after the scalar argument loads, a single
; store instruction, then s_endpgm. No further wait and no cache-control
; instruction is expected around the store.
; The two gfx940 configurations expect `sc0 sc1` on the store; all other
; configurations expect the store without cache-policy modifiers.
; NOTE(review): the non-atomic store in the load tests of this file carries the
; same `sc0 sc1` bits on gfx940, so this is presumably not scope-related -
; confirm against the backend.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
740define amdgpu_kernel void @global_wavefront_unordered_store(
741; GFX6-LABEL: global_wavefront_unordered_store:
742; GFX6:       ; %bb.0: ; %entry
743; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
744; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
745; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
746; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
747; GFX6-NEXT:    s_mov_b32 s11, s5
748; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
749; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
750; GFX6-NEXT:    s_mov_b32 s10, -1
751; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
752; GFX6-NEXT:    s_mov_b32 s5, s11
753; GFX6-NEXT:    s_mov_b32 s6, s10
754; GFX6-NEXT:    s_mov_b32 s7, s9
755; GFX6-NEXT:    v_mov_b32_e32 v0, s8
756; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
757; GFX6-NEXT:    s_endpgm
758;
759; GFX7-LABEL: global_wavefront_unordered_store:
760; GFX7:       ; %bb.0: ; %entry
761; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
762; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
763; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX7-NEXT:    v_mov_b32_e32 v0, s6
765; GFX7-NEXT:    v_mov_b32_e32 v1, s7
766; GFX7-NEXT:    v_mov_b32_e32 v2, s4
767; GFX7-NEXT:    flat_store_dword v[0:1], v2
768; GFX7-NEXT:    s_endpgm
769;
770; GFX10-WGP-LABEL: global_wavefront_unordered_store:
771; GFX10-WGP:       ; %bb.0: ; %entry
772; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
773; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
774; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
775; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
777; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
778; GFX10-WGP-NEXT:    s_endpgm
779;
780; GFX10-CU-LABEL: global_wavefront_unordered_store:
781; GFX10-CU:       ; %bb.0: ; %entry
782; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
783; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
784; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
785; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
787; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
788; GFX10-CU-NEXT:    s_endpgm
789;
790; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_store:
791; SKIP-CACHE-INV:       ; %bb.0: ; %entry
792; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
793; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
794; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
795; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
796; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
797; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
798; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
799; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
800; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
801; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
802; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
803; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
805; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
806; SKIP-CACHE-INV-NEXT:    s_endpgm
807;
808; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_store:
809; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
810; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
811; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
812; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
813; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
815; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
816; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
817;
818; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_store:
819; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
820; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
821; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
822; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
823; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
824; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
825; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
826; GFX90A-TGSPLIT-NEXT:    s_endpgm
827;
828; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_store:
829; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
830; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
831; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
832; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
833; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
834; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
835; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
836; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
837;
838; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_store:
839; GFX940-TGSPLIT:       ; %bb.0: ; %entry
840; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
841; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
842; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
843; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
845; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
846; GFX940-TGSPLIT-NEXT:    s_endpgm
847;
848; GFX11-WGP-LABEL: global_wavefront_unordered_store:
849; GFX11-WGP:       ; %bb.0: ; %entry
850; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
851; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
852; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
853; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
855; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
856; GFX11-WGP-NEXT:    s_endpgm
857;
858; GFX11-CU-LABEL: global_wavefront_unordered_store:
859; GFX11-CU:       ; %bb.0: ; %entry
860; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
861; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
862; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
863; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
865; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
866; GFX11-CU-NEXT:    s_endpgm
867;
868; GFX12-WGP-LABEL: global_wavefront_unordered_store:
869; GFX12-WGP:       ; %bb.0: ; %entry
870; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
871; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
872; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
873; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
874; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
875; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
876; GFX12-WGP-NEXT:    s_endpgm
877;
878; GFX12-CU-LABEL: global_wavefront_unordered_store:
879; GFX12-CU:       ; %bb.0: ; %entry
880; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
881; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
882; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
883; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
884; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
885; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
886; GFX12-CU-NEXT:    s_endpgm
887    i32 %in, ptr addrspace(1) %out) {
888entry:
  ; Operation under test: wavefront-scope atomic store, unordered ordering.
889  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
890  ret void
891}
892
; Test: atomic i32 store to global memory (addrspace(1)) with
; syncscope("wavefront") and `monotonic` ordering.
; The kernel receives the value (%in) and the destination pointer (%out) as
; arguments. For every check prefix the expected code is: argument loads and
; register setup, one wait placed after the scalar argument loads, a single
; store instruction, then s_endpgm. No further wait and no cache-control
; instruction is expected around the store.
; The two gfx940 configurations expect `sc0 sc1` on the store; all other
; configurations expect the store without cache-policy modifiers.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
893define amdgpu_kernel void @global_wavefront_monotonic_store(
894; GFX6-LABEL: global_wavefront_monotonic_store:
895; GFX6:       ; %bb.0: ; %entry
896; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
897; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
898; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
899; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX6-NEXT:    s_mov_b32 s11, s5
901; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
902; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
903; GFX6-NEXT:    s_mov_b32 s10, -1
904; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
905; GFX6-NEXT:    s_mov_b32 s5, s11
906; GFX6-NEXT:    s_mov_b32 s6, s10
907; GFX6-NEXT:    s_mov_b32 s7, s9
908; GFX6-NEXT:    v_mov_b32_e32 v0, s8
909; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
910; GFX6-NEXT:    s_endpgm
911;
912; GFX7-LABEL: global_wavefront_monotonic_store:
913; GFX7:       ; %bb.0: ; %entry
914; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
915; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX7-NEXT:    v_mov_b32_e32 v0, s6
918; GFX7-NEXT:    v_mov_b32_e32 v1, s7
919; GFX7-NEXT:    v_mov_b32_e32 v2, s4
920; GFX7-NEXT:    flat_store_dword v[0:1], v2
921; GFX7-NEXT:    s_endpgm
922;
923; GFX10-WGP-LABEL: global_wavefront_monotonic_store:
924; GFX10-WGP:       ; %bb.0: ; %entry
925; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
926; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
927; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
928; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
929; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
930; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
931; GFX10-WGP-NEXT:    s_endpgm
932;
933; GFX10-CU-LABEL: global_wavefront_monotonic_store:
934; GFX10-CU:       ; %bb.0: ; %entry
935; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
936; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
937; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
938; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
940; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
941; GFX10-CU-NEXT:    s_endpgm
942;
943; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_store:
944; SKIP-CACHE-INV:       ; %bb.0: ; %entry
945; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
946; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
947; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
948; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
949; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
950; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
951; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
952; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
953; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
954; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
955; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
956; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
957; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
958; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
959; SKIP-CACHE-INV-NEXT:    s_endpgm
960;
961; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store:
962; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
963; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
964; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
965; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
966; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
968; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
969; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
970;
971; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_store:
972; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
973; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
974; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
975; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
976; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
978; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
979; GFX90A-TGSPLIT-NEXT:    s_endpgm
980;
981; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store:
982; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
983; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
984; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
985; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
986; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
988; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
989; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
990;
991; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_store:
992; GFX940-TGSPLIT:       ; %bb.0: ; %entry
993; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
994; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
995; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
996; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
997; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
998; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
999; GFX940-TGSPLIT-NEXT:    s_endpgm
1000;
1001; GFX11-WGP-LABEL: global_wavefront_monotonic_store:
1002; GFX11-WGP:       ; %bb.0: ; %entry
1003; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1004; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1005; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1006; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1008; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1009; GFX11-WGP-NEXT:    s_endpgm
1010;
1011; GFX11-CU-LABEL: global_wavefront_monotonic_store:
1012; GFX11-CU:       ; %bb.0: ; %entry
1013; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1014; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1015; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1016; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1018; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1019; GFX11-CU-NEXT:    s_endpgm
1020;
1021; GFX12-WGP-LABEL: global_wavefront_monotonic_store:
1022; GFX12-WGP:       ; %bb.0: ; %entry
1023; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1024; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1025; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1026; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1027; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1028; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1029; GFX12-WGP-NEXT:    s_endpgm
1030;
1031; GFX12-CU-LABEL: global_wavefront_monotonic_store:
1032; GFX12-CU:       ; %bb.0: ; %entry
1033; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1034; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1035; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1036; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1037; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1038; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1039; GFX12-CU-NEXT:    s_endpgm
1040    i32 %in, ptr addrspace(1) %out) {
1041entry:
  ; Operation under test: wavefront-scope atomic store, monotonic ordering.
1042  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
1043  ret void
1044}
1045
; Test: atomic i32 store to global memory (addrspace(1)) with
; syncscope("wavefront") and `release` ordering.
; The kernel receives the value (%in) and the destination pointer (%out) as
; arguments. For every check prefix the expected code is: argument loads and
; register setup, one wait placed after the scalar argument loads, a single
; store instruction, then s_endpgm. In particular, no additional wait and no
; cache-control instruction is expected between the argument setup and the
; store, despite the release ordering.
; The two gfx940 configurations expect `sc0 sc1` on the store; all other
; configurations expect the store without cache-policy modifiers.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1046define amdgpu_kernel void @global_wavefront_release_store(
1047; GFX6-LABEL: global_wavefront_release_store:
1048; GFX6:       ; %bb.0: ; %entry
1049; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
1050; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
1051; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1052; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX6-NEXT:    s_mov_b32 s11, s5
1054; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1055; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1056; GFX6-NEXT:    s_mov_b32 s10, -1
1057; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1058; GFX6-NEXT:    s_mov_b32 s5, s11
1059; GFX6-NEXT:    s_mov_b32 s6, s10
1060; GFX6-NEXT:    s_mov_b32 s7, s9
1061; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1062; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1063; GFX6-NEXT:    s_endpgm
1064;
1065; GFX7-LABEL: global_wavefront_release_store:
1066; GFX7:       ; %bb.0: ; %entry
1067; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
1068; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
1069; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1071; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1072; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1073; GFX7-NEXT:    flat_store_dword v[0:1], v2
1074; GFX7-NEXT:    s_endpgm
1075;
1076; GFX10-WGP-LABEL: global_wavefront_release_store:
1077; GFX10-WGP:       ; %bb.0: ; %entry
1078; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
1079; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1080; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1081; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1083; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
1084; GFX10-WGP-NEXT:    s_endpgm
1085;
1086; GFX10-CU-LABEL: global_wavefront_release_store:
1087; GFX10-CU:       ; %bb.0: ; %entry
1088; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
1089; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1090; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1091; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1092; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
1093; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
1094; GFX10-CU-NEXT:    s_endpgm
1095;
1096; SKIP-CACHE-INV-LABEL: global_wavefront_release_store:
1097; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1098; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
1099; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
1100; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
1101; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1102; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
1103; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1104; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
1105; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1106; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1107; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
1108; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
1109; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
1110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1111; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1112; SKIP-CACHE-INV-NEXT:    s_endpgm
1113;
1114; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_store:
1115; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1116; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
1117; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1118; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1119; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1120; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1121; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
1122; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1123;
1124; GFX90A-TGSPLIT-LABEL: global_wavefront_release_store:
1125; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1126; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
1127; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1128; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1129; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1131; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
1132; GFX90A-TGSPLIT-NEXT:    s_endpgm
1133;
1134; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_store:
1135; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1136; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
1137; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1138; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1139; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1141; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
1142; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1143;
1144; GFX940-TGSPLIT-LABEL: global_wavefront_release_store:
1145; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1146; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
1147; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1148; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1149; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1150; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1151; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
1152; GFX940-TGSPLIT-NEXT:    s_endpgm
1153;
1154; GFX11-WGP-LABEL: global_wavefront_release_store:
1155; GFX11-WGP:       ; %bb.0: ; %entry
1156; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1157; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1158; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1159; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1161; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1162; GFX11-WGP-NEXT:    s_endpgm
1163;
1164; GFX11-CU-LABEL: global_wavefront_release_store:
1165; GFX11-CU:       ; %bb.0: ; %entry
1166; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1167; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1168; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1169; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1171; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1172; GFX11-CU-NEXT:    s_endpgm
1173;
1174; GFX12-WGP-LABEL: global_wavefront_release_store:
1175; GFX12-WGP:       ; %bb.0: ; %entry
1176; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1177; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1178; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1179; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1180; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1181; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1182; GFX12-WGP-NEXT:    s_endpgm
1183;
1184; GFX12-CU-LABEL: global_wavefront_release_store:
1185; GFX12-CU:       ; %bb.0: ; %entry
1186; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1187; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1188; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1189; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1190; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1191; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1192; GFX12-CU-NEXT:    s_endpgm
1193    i32 %in, ptr addrspace(1) %out) {
1194entry:
  ; Operation under test: wavefront-scope atomic store, release ordering.
1195  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
1196  ret void
1197}
1198
; Test: atomic i32 store to global memory (addrspace(1)) with
; syncscope("wavefront") and `seq_cst` ordering.
; The kernel receives the value (%in) and the destination pointer (%out) as
; arguments. For every check prefix the expected code is: argument loads and
; register setup, one wait placed after the scalar argument loads, a single
; store instruction, then s_endpgm. In particular, no additional wait and no
; cache-control instruction is expected before or after the store, despite the
; sequentially-consistent ordering.
; The two gfx940 configurations expect `sc0 sc1` on the store; all other
; configurations expect the store without cache-policy modifiers.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1199define amdgpu_kernel void @global_wavefront_seq_cst_store(
1200; GFX6-LABEL: global_wavefront_seq_cst_store:
1201; GFX6:       ; %bb.0: ; %entry
1202; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
1203; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
1204; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
1205; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX6-NEXT:    s_mov_b32 s11, s5
1207; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1208; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1209; GFX6-NEXT:    s_mov_b32 s10, -1
1210; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1211; GFX6-NEXT:    s_mov_b32 s5, s11
1212; GFX6-NEXT:    s_mov_b32 s6, s10
1213; GFX6-NEXT:    s_mov_b32 s7, s9
1214; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1215; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1216; GFX6-NEXT:    s_endpgm
1217;
1218; GFX7-LABEL: global_wavefront_seq_cst_store:
1219; GFX7:       ; %bb.0: ; %entry
1220; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
1221; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
1222; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1223; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1224; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1225; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1226; GFX7-NEXT:    flat_store_dword v[0:1], v2
1227; GFX7-NEXT:    s_endpgm
1228;
1229; GFX10-WGP-LABEL: global_wavefront_seq_cst_store:
1230; GFX10-WGP:       ; %bb.0: ; %entry
1231; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
1232; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1233; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1234; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1235; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1236; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
1237; GFX10-WGP-NEXT:    s_endpgm
1238;
1239; GFX10-CU-LABEL: global_wavefront_seq_cst_store:
1240; GFX10-CU:       ; %bb.0: ; %entry
1241; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
1242; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1243; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1244; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
1246; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
1247; GFX10-CU-NEXT:    s_endpgm
1248;
1249; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_store:
1250; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1251; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
1252; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
1253; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
1254; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1255; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
1256; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1257; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
1258; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1259; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1260; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
1261; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
1262; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
1263; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1264; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1265; SKIP-CACHE-INV-NEXT:    s_endpgm
1266;
1267; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store:
1268; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1269; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
1270; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1271; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1272; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1273; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1274; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
1275; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1276;
1277; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_store:
1278; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1279; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
1280; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
1281; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1282; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1284; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
1285; GFX90A-TGSPLIT-NEXT:    s_endpgm
1286;
1287; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store:
1288; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1289; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
1290; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1291; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1292; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1294; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
1295; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1296;
1297; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_store:
1298; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1299; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
1300; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1301; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1302; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1304; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
1305; GFX940-TGSPLIT-NEXT:    s_endpgm
1306;
1307; GFX11-WGP-LABEL: global_wavefront_seq_cst_store:
1308; GFX11-WGP:       ; %bb.0: ; %entry
1309; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1310; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1311; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1312; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1313; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1314; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1315; GFX11-WGP-NEXT:    s_endpgm
1316;
1317; GFX11-CU-LABEL: global_wavefront_seq_cst_store:
1318; GFX11-CU:       ; %bb.0: ; %entry
1319; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1320; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1321; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1322; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1323; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1324; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1325; GFX11-CU-NEXT:    s_endpgm
1326;
1327; GFX12-WGP-LABEL: global_wavefront_seq_cst_store:
1328; GFX12-WGP:       ; %bb.0: ; %entry
1329; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1330; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1331; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1332; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1333; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1334; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1335; GFX12-WGP-NEXT:    s_endpgm
1336;
1337; GFX12-CU-LABEL: global_wavefront_seq_cst_store:
1338; GFX12-CU:       ; %bb.0: ; %entry
1339; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1340; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1341; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1342; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1343; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1344; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1345; GFX12-CU-NEXT:    s_endpgm
1346    i32 %in, ptr addrspace(1) %out) {
1347entry:
  ; Operation under test: wavefront-scope atomic store, seq_cst ordering.
1348  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
1349  ret void
1350}
1351
; Test: volatile `atomicrmw xchg` of %in into the addrspace(1) pointer %out
; with syncscope("wavefront") and monotonic ordering.
; In every expected sequence below the code is: scalar loads and register
; moves, exactly one atomic swap (buffer_atomic_swap, flat_atomic_swap,
; global_atomic_swap or global_atomic_swap_b32 depending on the prefix), and
; then s_endpgm with nothing in between.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1352define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
1353; GFX6-LABEL: global_wavefront_monotonic_atomicrmw:
1354; GFX6:       ; %bb.0: ; %entry
1355; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1356; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
1357; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX6-NEXT:    s_mov_b32 s11, s5
1359; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1360; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1361; GFX6-NEXT:    s_mov_b32 s10, -1
1362; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1363; GFX6-NEXT:    s_mov_b32 s5, s11
1364; GFX6-NEXT:    s_mov_b32 s6, s10
1365; GFX6-NEXT:    s_mov_b32 s7, s9
1366; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1367; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1368; GFX6-NEXT:    s_endpgm
1369;
1370; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
1371; GFX7:       ; %bb.0: ; %entry
1372; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1373; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1374; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1376; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1377; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1378; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1379; GFX7-NEXT:    s_endpgm
1380;
1381; GFX10-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
1382; GFX10-WGP:       ; %bb.0: ; %entry
1383; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1384; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1385; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
1386; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1387; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1388; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
1389; GFX10-WGP-NEXT:    s_endpgm
1390;
1391; GFX10-CU-LABEL: global_wavefront_monotonic_atomicrmw:
1392; GFX10-CU:       ; %bb.0: ; %entry
1393; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1394; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1395; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
1396; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
1398; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
1399; GFX10-CU-NEXT:    s_endpgm
1400;
1401; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_atomicrmw:
1402; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1403; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1404; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
1405; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1406; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
1407; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1408; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
1409; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1410; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1411; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
1412; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
1413; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
1414; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1415; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1416; SKIP-CACHE-INV-NEXT:    s_endpgm
1417;
1418; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
1419; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1420; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1421; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1422; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1423; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1424; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1425; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1426; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1427;
1428; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
1429; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1430; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1431; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1432; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1433; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1434; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1435; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1436; GFX90A-TGSPLIT-NEXT:    s_endpgm
1437;
1438; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
1439; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1440; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1441; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1442; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1443; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1444; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1445; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1446; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1447;
1448; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
1449; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1450; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1451; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1452; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1453; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1454; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1455; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1456; GFX940-TGSPLIT-NEXT:    s_endpgm
1457;
1458; GFX11-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
1459; GFX11-WGP:       ; %bb.0: ; %entry
1460; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1461; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1462; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1463; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1464; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1465; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1466; GFX11-WGP-NEXT:    s_endpgm
1467;
1468; GFX11-CU-LABEL: global_wavefront_monotonic_atomicrmw:
1469; GFX11-CU:       ; %bb.0: ; %entry
1470; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1471; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1472; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1473; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1474; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1475; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1476; GFX11-CU-NEXT:    s_endpgm
1477;
1478; GFX12-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
1479; GFX12-WGP:       ; %bb.0: ; %entry
1480; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1481; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1482; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1483; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1484; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1485; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1486; GFX12-WGP-NEXT:    s_endpgm
1487;
1488; GFX12-CU-LABEL: global_wavefront_monotonic_atomicrmw:
1489; GFX12-CU:       ; %bb.0: ; %entry
1490; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1491; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1492; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1493; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1494; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1495; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1496; GFX12-CU-NEXT:    s_endpgm
1497    ptr addrspace(1) %out, i32 %in) {
1498entry:
  ; The exchanged-out value %val is not used by any later instruction.
1499  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
1500  ret void
1501}
1502
; Test: volatile `atomicrmw xchg` of %in into the addrspace(1) pointer %out
; with syncscope("wavefront") and acquire ordering.
; In every expected sequence below the code is: scalar loads and register
; moves, exactly one atomic swap (buffer_atomic_swap, flat_atomic_swap,
; global_atomic_swap or global_atomic_swap_b32 depending on the prefix), and
; then s_endpgm with nothing in between.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1503define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
1504; GFX6-LABEL: global_wavefront_acquire_atomicrmw:
1505; GFX6:       ; %bb.0: ; %entry
1506; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1507; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
1508; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1509; GFX6-NEXT:    s_mov_b32 s11, s5
1510; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1511; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1512; GFX6-NEXT:    s_mov_b32 s10, -1
1513; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1514; GFX6-NEXT:    s_mov_b32 s5, s11
1515; GFX6-NEXT:    s_mov_b32 s6, s10
1516; GFX6-NEXT:    s_mov_b32 s7, s9
1517; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1518; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1519; GFX6-NEXT:    s_endpgm
1520;
1521; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
1522; GFX7:       ; %bb.0: ; %entry
1523; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1524; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1525; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1526; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1527; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1528; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1529; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1530; GFX7-NEXT:    s_endpgm
1531;
1532; GFX10-WGP-LABEL: global_wavefront_acquire_atomicrmw:
1533; GFX10-WGP:       ; %bb.0: ; %entry
1534; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1535; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1536; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
1537; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1539; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
1540; GFX10-WGP-NEXT:    s_endpgm
1541;
1542; GFX10-CU-LABEL: global_wavefront_acquire_atomicrmw:
1543; GFX10-CU:       ; %bb.0: ; %entry
1544; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1545; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1546; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
1547; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
1549; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
1550; GFX10-CU-NEXT:    s_endpgm
1551;
1552; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_atomicrmw:
1553; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1554; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1555; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
1556; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1557; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
1558; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1559; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
1560; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1561; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1562; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
1563; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
1564; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
1565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1566; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1567; SKIP-CACHE-INV-NEXT:    s_endpgm
1568;
1569; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
1570; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1571; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1572; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1573; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1574; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1575; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1576; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1577; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1578;
1579; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
1580; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1581; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1582; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1583; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1584; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1586; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1587; GFX90A-TGSPLIT-NEXT:    s_endpgm
1588;
1589; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
1590; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1591; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1592; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1593; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1594; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1596; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1597; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1598;
1599; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
1600; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1601; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1602; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1603; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1604; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1605; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1606; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1607; GFX940-TGSPLIT-NEXT:    s_endpgm
1608;
1609; GFX11-WGP-LABEL: global_wavefront_acquire_atomicrmw:
1610; GFX11-WGP:       ; %bb.0: ; %entry
1611; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1612; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1613; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1614; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1615; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1616; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1617; GFX11-WGP-NEXT:    s_endpgm
1618;
1619; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw:
1620; GFX11-CU:       ; %bb.0: ; %entry
1621; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1622; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1623; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1624; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1625; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1626; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1627; GFX11-CU-NEXT:    s_endpgm
1628;
1629; GFX12-WGP-LABEL: global_wavefront_acquire_atomicrmw:
1630; GFX12-WGP:       ; %bb.0: ; %entry
1631; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1632; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1633; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1634; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1635; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1636; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1637; GFX12-WGP-NEXT:    s_endpgm
1638;
1639; GFX12-CU-LABEL: global_wavefront_acquire_atomicrmw:
1640; GFX12-CU:       ; %bb.0: ; %entry
1641; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1642; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1643; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1644; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1645; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1646; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1647; GFX12-CU-NEXT:    s_endpgm
1648    ptr addrspace(1) %out, i32 %in) {
1649entry:
  ; The exchanged-out value %val is not used by any later instruction.
1650  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
1651  ret void
1652}
1653
; Test: volatile `atomicrmw xchg` of %in into the addrspace(1) pointer %out
; with syncscope("wavefront") and release ordering.
; In every expected sequence below the code is: scalar loads and register
; moves, exactly one atomic swap (buffer_atomic_swap, flat_atomic_swap,
; global_atomic_swap or global_atomic_swap_b32 depending on the prefix), and
; then s_endpgm with nothing in between.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1654define amdgpu_kernel void @global_wavefront_release_atomicrmw(
1655; GFX6-LABEL: global_wavefront_release_atomicrmw:
1656; GFX6:       ; %bb.0: ; %entry
1657; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1658; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
1659; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1660; GFX6-NEXT:    s_mov_b32 s11, s5
1661; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1662; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1663; GFX6-NEXT:    s_mov_b32 s10, -1
1664; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1665; GFX6-NEXT:    s_mov_b32 s5, s11
1666; GFX6-NEXT:    s_mov_b32 s6, s10
1667; GFX6-NEXT:    s_mov_b32 s7, s9
1668; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1669; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1670; GFX6-NEXT:    s_endpgm
1671;
1672; GFX7-LABEL: global_wavefront_release_atomicrmw:
1673; GFX7:       ; %bb.0: ; %entry
1674; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1675; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1676; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1677; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1678; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1679; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1680; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1681; GFX7-NEXT:    s_endpgm
1682;
1683; GFX10-WGP-LABEL: global_wavefront_release_atomicrmw:
1684; GFX10-WGP:       ; %bb.0: ; %entry
1685; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1686; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1687; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
1688; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1689; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1690; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
1691; GFX10-WGP-NEXT:    s_endpgm
1692;
1693; GFX10-CU-LABEL: global_wavefront_release_atomicrmw:
1694; GFX10-CU:       ; %bb.0: ; %entry
1695; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1696; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1697; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
1698; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
1700; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
1701; GFX10-CU-NEXT:    s_endpgm
1702;
1703; SKIP-CACHE-INV-LABEL: global_wavefront_release_atomicrmw:
1704; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1705; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1706; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
1707; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1708; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
1709; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1710; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
1711; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1712; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1713; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
1714; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
1715; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
1716; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1717; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1718; SKIP-CACHE-INV-NEXT:    s_endpgm
1719;
1720; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw:
1721; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1722; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1723; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1724; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1725; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1726; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1727; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1728; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1729;
1730; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw:
1731; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1732; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1733; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1734; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1735; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1736; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1737; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1738; GFX90A-TGSPLIT-NEXT:    s_endpgm
1739;
1740; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw:
1741; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1742; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1743; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1744; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1745; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1746; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1747; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1748; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1749;
1750; GFX940-TGSPLIT-LABEL: global_wavefront_release_atomicrmw:
1751; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1752; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1753; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1754; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1755; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1756; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1757; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1758; GFX940-TGSPLIT-NEXT:    s_endpgm
1759;
1760; GFX11-WGP-LABEL: global_wavefront_release_atomicrmw:
1761; GFX11-WGP:       ; %bb.0: ; %entry
1762; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1763; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1764; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1765; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1767; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1768; GFX11-WGP-NEXT:    s_endpgm
1769;
1770; GFX11-CU-LABEL: global_wavefront_release_atomicrmw:
1771; GFX11-CU:       ; %bb.0: ; %entry
1772; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1773; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1774; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1775; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1777; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1778; GFX11-CU-NEXT:    s_endpgm
1779;
1780; GFX12-WGP-LABEL: global_wavefront_release_atomicrmw:
1781; GFX12-WGP:       ; %bb.0: ; %entry
1782; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1783; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1784; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1785; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1786; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1787; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1788; GFX12-WGP-NEXT:    s_endpgm
1789;
1790; GFX12-CU-LABEL: global_wavefront_release_atomicrmw:
1791; GFX12-CU:       ; %bb.0: ; %entry
1792; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1793; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1794; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1795; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1796; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1797; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1798; GFX12-CU-NEXT:    s_endpgm
1799    ptr addrspace(1) %out, i32 %in) {
1800entry:
  ; The exchanged-out value %val is not used by any later instruction.
1801  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
1802  ret void
1803}
1804
; Test: volatile `atomicrmw xchg` of %in into the addrspace(1) pointer %out
; with syncscope("wavefront") and acq_rel ordering.
; In every expected sequence below the code is: scalar loads and register
; moves, exactly one atomic swap (buffer_atomic_swap, flat_atomic_swap,
; global_atomic_swap or global_atomic_swap_b32 depending on the prefix), and
; then s_endpgm with nothing in between.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1805define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
1806; GFX6-LABEL: global_wavefront_acq_rel_atomicrmw:
1807; GFX6:       ; %bb.0: ; %entry
1808; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1809; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
1810; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1811; GFX6-NEXT:    s_mov_b32 s11, s5
1812; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1813; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1814; GFX6-NEXT:    s_mov_b32 s10, -1
1815; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1816; GFX6-NEXT:    s_mov_b32 s5, s11
1817; GFX6-NEXT:    s_mov_b32 s6, s10
1818; GFX6-NEXT:    s_mov_b32 s7, s9
1819; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1820; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1821; GFX6-NEXT:    s_endpgm
1822;
1823; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
1824; GFX7:       ; %bb.0: ; %entry
1825; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1826; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1827; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1828; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1829; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1830; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1831; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1832; GFX7-NEXT:    s_endpgm
1833;
1834; GFX10-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
1835; GFX10-WGP:       ; %bb.0: ; %entry
1836; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1837; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1838; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
1839; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1840; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1841; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
1842; GFX10-WGP-NEXT:    s_endpgm
1843;
1844; GFX10-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
1845; GFX10-CU:       ; %bb.0: ; %entry
1846; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1847; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1848; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
1849; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1850; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
1851; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
1852; GFX10-CU-NEXT:    s_endpgm
1853;
1854; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_atomicrmw:
1855; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1856; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1857; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
1858; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1859; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
1860; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1861; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
1862; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1863; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1864; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
1865; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
1866; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
1867; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1868; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
1869; SKIP-CACHE-INV-NEXT:    s_endpgm
1870;
1871; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
1872; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
1873; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1874; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1875; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1876; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1878; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1879; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
1880;
1881; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
1882; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
1883; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1884; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1885; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
1886; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
1888; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
1889; GFX90A-TGSPLIT-NEXT:    s_endpgm
1890;
1891; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
1892; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
1893; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1894; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1895; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1896; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1898; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1899; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
1900;
1901; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
1902; GFX940-TGSPLIT:       ; %bb.0: ; %entry
1903; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
1904; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1905; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
1906; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
1907; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
1908; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
1909; GFX940-TGSPLIT-NEXT:    s_endpgm
1910;
1911; GFX11-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
1912; GFX11-WGP:       ; %bb.0: ; %entry
1913; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1914; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1915; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1916; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1917; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
1918; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1919; GFX11-WGP-NEXT:    s_endpgm
1920;
1921; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
1922; GFX11-CU:       ; %bb.0: ; %entry
1923; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1924; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1925; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1926; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1927; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
1928; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1929; GFX11-CU-NEXT:    s_endpgm
1930;
1931; GFX12-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
1932; GFX12-WGP:       ; %bb.0: ; %entry
1933; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1934; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1935; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
1936; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1937; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
1938; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1939; GFX12-WGP-NEXT:    s_endpgm
1940;
1941; GFX12-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
1942; GFX12-CU:       ; %bb.0: ; %entry
1943; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1944; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1945; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
1946; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1947; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
1948; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
1949; GFX12-CU-NEXT:    s_endpgm
1950    ptr addrspace(1) %out, i32 %in) {
1951entry:
  ; The exchanged-out value %val is not used by any later instruction.
1952  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
1953  ret void
1954}
1955
1956define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
1957; GFX6-LABEL: global_wavefront_seq_cst_atomicrmw:
1958; GFX6:       ; %bb.0: ; %entry
1959; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1960; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
1961; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1962; GFX6-NEXT:    s_mov_b32 s11, s5
1963; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
1964; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
1965; GFX6-NEXT:    s_mov_b32 s10, -1
1966; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
1967; GFX6-NEXT:    s_mov_b32 s5, s11
1968; GFX6-NEXT:    s_mov_b32 s6, s10
1969; GFX6-NEXT:    s_mov_b32 s7, s9
1970; GFX6-NEXT:    v_mov_b32_e32 v0, s8
1971; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
1972; GFX6-NEXT:    s_endpgm
1973;
1974; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
1975; GFX7:       ; %bb.0: ; %entry
1976; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
1977; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
1978; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX7-NEXT:    v_mov_b32_e32 v0, s6
1980; GFX7-NEXT:    v_mov_b32_e32 v1, s7
1981; GFX7-NEXT:    v_mov_b32_e32 v2, s4
1982; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
1983; GFX7-NEXT:    s_endpgm
1984;
1985; GFX10-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
1986; GFX10-WGP:       ; %bb.0: ; %entry
1987; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1988; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1989; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
1990; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
1992; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
1993; GFX10-WGP-NEXT:    s_endpgm
1994;
1995; GFX10-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
1996; GFX10-CU:       ; %bb.0: ; %entry
1997; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1998; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1999; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2000; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
2002; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
2003; GFX10-CU-NEXT:    s_endpgm
2004;
2005; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_atomicrmw:
2006; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2007; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2008; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
2009; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2010; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
2011; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
2012; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
2013; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2014; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
2015; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
2016; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
2017; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
2018; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2019; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
2020; SKIP-CACHE-INV-NEXT:    s_endpgm
2021;
2022; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
2023; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2024; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2025; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2026; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2027; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2029; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
2030; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2031;
2032; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
2033; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2034; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2035; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2036; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2037; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2038; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2039; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
2040; GFX90A-TGSPLIT-NEXT:    s_endpgm
2041;
2042; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
2043; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2044; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2045; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2046; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2047; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2048; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2049; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
2050; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2051;
2052; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
2053; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2054; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2055; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2056; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2057; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2059; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
2060; GFX940-TGSPLIT-NEXT:    s_endpgm
2061;
2062; GFX11-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
2063; GFX11-WGP:       ; %bb.0: ; %entry
2064; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
2065; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2066; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2067; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2068; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
2069; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
2070; GFX11-WGP-NEXT:    s_endpgm
2071;
2072; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
2073; GFX11-CU:       ; %bb.0: ; %entry
2074; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
2075; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2076; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2077; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2078; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
2079; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
2080; GFX11-CU-NEXT:    s_endpgm
2081;
2082; GFX12-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
2083; GFX12-WGP:       ; %bb.0: ; %entry
2084; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
2085; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2086; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2087; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2088; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
2089; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
2090; GFX12-WGP-NEXT:    s_endpgm
2091;
2092; GFX12-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
2093; GFX12-CU:       ; %bb.0: ; %entry
2094; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
2095; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2096; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2097; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2098; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
2099; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
2100; GFX12-CU-NEXT:    s_endpgm
2101    ptr addrspace(1) %out, i32 %in) {
2102entry:
2103  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
2104  ret void
2105}
2106
; Test case: wavefront-scope `acquire` atomic exchange on global memory
; (addrspace 1) whose result is used. The kernel exchanges %in into *%out and
; then stores the value returned by the exchange back through %out.
;
; Every run configuration below expects the same shape of code:
;   - a swap that returns its result (buffer_/flat_/global_atomic_swap marked
;     `glc`, `sc0` on the gfx940 runs, or `th:TH_ATOMIC_RETURN` on the gfx1200
;     runs),
;   - `s_waitcnt vmcnt(0)` (`s_wait_loadcnt 0x0` on the gfx1200 runs),
;   - the store of the returned value, then `s_endpgm`,
; with no other instruction in between.
; TODO confirm: presumably the point of the test is that wavefront scope needs
; no cache invalidation for the acquire; verify against the AMDGPU memory model.
;
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2107define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
2108; GFX6-LABEL: global_wavefront_acquire_ret_atomicrmw:
2109; GFX6:       ; %bb.0: ; %entry
2110; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2111; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
2112; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2113; GFX6-NEXT:    s_mov_b32 s11, s5
2114; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
2115; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
2116; GFX6-NEXT:    s_mov_b32 s10, -1
2117; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
2118; GFX6-NEXT:    s_mov_b32 s5, s11
2119; GFX6-NEXT:    s_mov_b32 s6, s10
2120; GFX6-NEXT:    s_mov_b32 s7, s9
2121; GFX6-NEXT:    v_mov_b32_e32 v0, s8
2122; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
2123; GFX6-NEXT:    s_waitcnt vmcnt(0)
2124; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2125; GFX6-NEXT:    s_endpgm
2126;
2127; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
2128; GFX7:       ; %bb.0: ; %entry
2129; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2130; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2131; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2133; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2134; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2135; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2136; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2137; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2138; GFX7-NEXT:    s_waitcnt vmcnt(0)
2139; GFX7-NEXT:    flat_store_dword v[0:1], v2
2140; GFX7-NEXT:    s_endpgm
2141;
2142; GFX10-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
2143; GFX10-WGP:       ; %bb.0: ; %entry
2144; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2145; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2146; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2147; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
2149; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2150; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2151; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
2152; GFX10-WGP-NEXT:    s_endpgm
2153;
2154; GFX10-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
2155; GFX10-CU:       ; %bb.0: ; %entry
2156; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2157; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2158; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2159; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
2161; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2162; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2163; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
2164; GFX10-CU-NEXT:    s_endpgm
2165;
2166; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_ret_atomicrmw:
2167; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2168; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2169; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
2170; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2171; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
2172; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
2173; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
2174; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2175; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
2176; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
2177; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
2178; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
2179; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2180; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
2181; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2182; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2183; SKIP-CACHE-INV-NEXT:    s_endpgm
2184;
2185; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
2186; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2187; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2188; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2189; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2190; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2191; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2192; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2193; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2194; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
2195; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2196;
2197; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
2198; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2199; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2200; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2201; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2202; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2204; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2205; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2206; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
2207; GFX90A-TGSPLIT-NEXT:    s_endpgm
2208;
2209; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
2210; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2211; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2212; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2213; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2214; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2215; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2216; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
2217; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2218; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
2219; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2220;
2221; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
2222; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2223; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2224; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2225; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2226; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2227; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2228; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
2229; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2230; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
2231; GFX940-TGSPLIT-NEXT:    s_endpgm
2232;
2233; GFX11-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
2234; GFX11-WGP:       ; %bb.0: ; %entry
2235; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
2236; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2237; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2238; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2239; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
2240; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
2241; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
2242; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
2243; GFX11-WGP-NEXT:    s_endpgm
2244;
2245; GFX11-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
2246; GFX11-CU:       ; %bb.0: ; %entry
2247; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
2248; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2249; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2250; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
2252; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
2253; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
2254; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
2255; GFX11-CU-NEXT:    s_endpgm
2256;
2257; GFX12-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
2258; GFX12-WGP:       ; %bb.0: ; %entry
2259; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
2260; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2261; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2262; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2263; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
2264; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
2265; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
2266; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
2267; GFX12-WGP-NEXT:    s_endpgm
2268;
2269; GFX12-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
2270; GFX12-CU:       ; %bb.0: ; %entry
2271; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
2272; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2273; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2274; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2275; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
2276; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
2277; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
2278; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
2279; GFX12-CU-NEXT:    s_endpgm
2280    ptr addrspace(1) %out, i32 %in) {
2281entry:
  ; The exchange result is consumed by the store below, so the returning form
  ; of the atomic is what the assertions above look for.
2282  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
2283  store i32 %val, ptr addrspace(1) %out, align 4
2284  ret void
2285}
2286
; Test case: wavefront-scope `acq_rel` atomic exchange on global memory
; (addrspace 1) whose result is used. The kernel exchanges %in into *%out and
; then stores the value returned by the exchange back through %out.
;
; The expected instruction sequences are the same, run for run, as those of
; @global_wavefront_acquire_ret_atomicrmw: a returning swap (`glc`, `sc0` on
; the gfx940 runs, `th:TH_ATOMIC_RETURN` on the gfx1200 runs), then
; `s_waitcnt vmcnt(0)` / `s_wait_loadcnt 0x0`, then the store and `s_endpgm`,
; with nothing emitted ahead of the atomic for the release half.
; TODO confirm: presumably this reflects that wavefront scope needs no extra
; waits or cache maintenance; verify against the AMDGPU memory model.
;
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2287define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
2288; GFX6-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2289; GFX6:       ; %bb.0: ; %entry
2290; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2291; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
2292; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2293; GFX6-NEXT:    s_mov_b32 s11, s5
2294; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
2295; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
2296; GFX6-NEXT:    s_mov_b32 s10, -1
2297; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
2298; GFX6-NEXT:    s_mov_b32 s5, s11
2299; GFX6-NEXT:    s_mov_b32 s6, s10
2300; GFX6-NEXT:    s_mov_b32 s7, s9
2301; GFX6-NEXT:    v_mov_b32_e32 v0, s8
2302; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
2303; GFX6-NEXT:    s_waitcnt vmcnt(0)
2304; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2305; GFX6-NEXT:    s_endpgm
2306;
2307; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2308; GFX7:       ; %bb.0: ; %entry
2309; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2310; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2311; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2313; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2314; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2315; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2316; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2317; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2318; GFX7-NEXT:    s_waitcnt vmcnt(0)
2319; GFX7-NEXT:    flat_store_dword v[0:1], v2
2320; GFX7-NEXT:    s_endpgm
2321;
2322; GFX10-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2323; GFX10-WGP:       ; %bb.0: ; %entry
2324; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2325; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2326; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2327; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2328; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
2329; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2330; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2331; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
2332; GFX10-WGP-NEXT:    s_endpgm
2333;
2334; GFX10-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2335; GFX10-CU:       ; %bb.0: ; %entry
2336; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2337; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2338; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2339; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2340; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
2341; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2342; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2343; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
2344; GFX10-CU-NEXT:    s_endpgm
2345;
2346; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2347; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2348; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2349; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
2350; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2351; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
2352; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
2353; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
2354; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2355; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
2356; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
2357; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
2358; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
2359; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2360; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
2361; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2362; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2363; SKIP-CACHE-INV-NEXT:    s_endpgm
2364;
2365; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2366; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2367; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2368; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2369; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2370; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2371; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2372; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2373; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2374; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
2375; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2376;
2377; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2378; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2379; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2380; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2381; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2382; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2383; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2384; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2385; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2386; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
2387; GFX90A-TGSPLIT-NEXT:    s_endpgm
2388;
2389; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2390; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2391; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2392; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2393; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2394; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2395; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2396; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
2397; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2398; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
2399; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2400;
2401; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2402; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2403; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2404; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2405; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2406; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2407; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2408; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
2409; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2410; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
2411; GFX940-TGSPLIT-NEXT:    s_endpgm
2412;
2413; GFX11-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2414; GFX11-WGP:       ; %bb.0: ; %entry
2415; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
2416; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2417; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2418; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2419; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
2420; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
2421; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
2422; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
2423; GFX11-WGP-NEXT:    s_endpgm
2424;
2425; GFX11-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2426; GFX11-CU:       ; %bb.0: ; %entry
2427; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
2428; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2429; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2430; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2431; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
2432; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
2433; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
2434; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
2435; GFX11-CU-NEXT:    s_endpgm
2436;
2437; GFX12-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2438; GFX12-WGP:       ; %bb.0: ; %entry
2439; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
2440; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2441; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2442; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2443; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
2444; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
2445; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
2446; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
2447; GFX12-WGP-NEXT:    s_endpgm
2448;
2449; GFX12-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
2450; GFX12-CU:       ; %bb.0: ; %entry
2451; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
2452; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2453; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2454; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2455; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
2456; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
2457; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
2458; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
2459; GFX12-CU-NEXT:    s_endpgm
2460    ptr addrspace(1) %out, i32 %in) {
2461entry:
  ; The exchange result is consumed by the store below, so the returning form
  ; of the atomic is what the assertions above look for.
2462  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
2463  store i32 %val, ptr addrspace(1) %out, align 4
2464  ret void
2465}
2466
; Test case: wavefront-scope `seq_cst` atomic exchange on global memory
; (addrspace 1) whose result is used. The kernel exchanges %in into *%out and
; then stores the value returned by the exchange back through %out.
;
; The expected instruction sequences are the same, run for run, as those of
; the acquire and acq_rel variants of this test: a returning swap (`glc`,
; `sc0` on the gfx940 runs, `th:TH_ATOMIC_RETURN` on the gfx1200 runs), then
; `s_waitcnt vmcnt(0)` / `s_wait_loadcnt 0x0`, then the store and `s_endpgm`,
; with no additional waits before the atomic.
; TODO confirm: presumably this reflects that wavefront scope needs no extra
; waits or cache maintenance even for seq_cst; verify against the AMDGPU
; memory model.
;
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2467define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
2468; GFX6-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2469; GFX6:       ; %bb.0: ; %entry
2470; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2471; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
2472; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2473; GFX6-NEXT:    s_mov_b32 s11, s5
2474; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
2475; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
2476; GFX6-NEXT:    s_mov_b32 s10, -1
2477; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
2478; GFX6-NEXT:    s_mov_b32 s5, s11
2479; GFX6-NEXT:    s_mov_b32 s6, s10
2480; GFX6-NEXT:    s_mov_b32 s7, s9
2481; GFX6-NEXT:    v_mov_b32_e32 v0, s8
2482; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
2483; GFX6-NEXT:    s_waitcnt vmcnt(0)
2484; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2485; GFX6-NEXT:    s_endpgm
2486;
2487; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2488; GFX7:       ; %bb.0: ; %entry
2489; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2490; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
2491; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2492; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2493; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2494; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2495; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
2496; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2497; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2498; GFX7-NEXT:    s_waitcnt vmcnt(0)
2499; GFX7-NEXT:    flat_store_dword v[0:1], v2
2500; GFX7-NEXT:    s_endpgm
2501;
2502; GFX10-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2503; GFX10-WGP:       ; %bb.0: ; %entry
2504; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2505; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2506; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
2507; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2508; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
2509; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2510; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2511; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
2512; GFX10-WGP-NEXT:    s_endpgm
2513;
2514; GFX10-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2515; GFX10-CU:       ; %bb.0: ; %entry
2516; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2517; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2518; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
2519; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2520; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
2521; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2522; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2523; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
2524; GFX10-CU-NEXT:    s_endpgm
2525;
2526; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2527; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2528; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2529; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
2530; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2531; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
2532; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
2533; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
2534; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2535; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
2536; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
2537; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
2538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
2539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2540; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
2541; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2542; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2543; SKIP-CACHE-INV-NEXT:    s_endpgm
2544;
2545; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2546; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2547; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2548; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2549; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2550; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2551; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2552; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2553; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2554; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
2555; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2556;
2557; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2558; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2559; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2560; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2561; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
2562; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2563; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2564; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
2565; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2566; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
2567; GFX90A-TGSPLIT-NEXT:    s_endpgm
2568;
2569; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2570; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2571; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2572; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2573; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2574; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2575; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2576; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
2577; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2578; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
2579; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2580;
2581; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2582; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2583; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2584; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2585; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
2586; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2587; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2588; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
2589; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
2590; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
2591; GFX940-TGSPLIT-NEXT:    s_endpgm
2592;
2593; GFX11-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2594; GFX11-WGP:       ; %bb.0: ; %entry
2595; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
2596; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2597; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2598; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2599; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
2600; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
2601; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
2602; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
2603; GFX11-WGP-NEXT:    s_endpgm
2604;
2605; GFX11-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2606; GFX11-CU:       ; %bb.0: ; %entry
2607; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
2608; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2609; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2610; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2611; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
2612; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
2613; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
2614; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
2615; GFX11-CU-NEXT:    s_endpgm
2616;
2617; GFX12-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2618; GFX12-WGP:       ; %bb.0: ; %entry
2619; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
2620; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2621; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
2622; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2623; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
2624; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
2625; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
2626; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
2627; GFX12-WGP-NEXT:    s_endpgm
2628;
2629; GFX12-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
2630; GFX12-CU:       ; %bb.0: ; %entry
2631; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
2632; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2633; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
2634; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2635; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
2636; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
2637; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
2638; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
2639; GFX12-CU-NEXT:    s_endpgm
2640    ptr addrspace(1) %out, i32 %in) {
2641entry:
  ; The exchange result is consumed by the store below, so the returning form
  ; of the atomic is what the assertions above look for.
2642  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
2643  store i32 %val, ptr addrspace(1) %out, align 4
2644  ret void
2645}
2646
2647define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
2648; GFX6-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2649; GFX6:       ; %bb.0: ; %entry
2650; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
2651; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
2652; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
2653; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
2654; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2655; GFX6-NEXT:    s_mov_b32 s12, s5
2656; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
2657; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
2658; GFX6-NEXT:    s_mov_b32 s11, -1
2659; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
2660; GFX6-NEXT:    s_mov_b32 s5, s12
2661; GFX6-NEXT:    s_mov_b32 s6, s11
2662; GFX6-NEXT:    s_mov_b32 s7, s10
2663; GFX6-NEXT:    v_mov_b32_e32 v0, s9
2664; GFX6-NEXT:    v_mov_b32_e32 v2, s8
2665; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
2666; GFX6-NEXT:    v_mov_b32_e32 v1, v2
2667; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2668; GFX6-NEXT:    s_endpgm
2669;
2670; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2671; GFX7:       ; %bb.0: ; %entry
2672; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2673; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2674; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2675; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2676; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2677; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2678; GFX7-NEXT:    s_mov_b32 s4, s8
2679; GFX7-NEXT:    s_mov_b32 s5, s9
2680; GFX7-NEXT:    s_mov_b32 s9, s10
2681; GFX7-NEXT:    s_mov_b32 s8, s11
2682; GFX7-NEXT:    s_add_u32 s4, s4, s9
2683; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2684; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2685; GFX7-NEXT:    s_mov_b32 s5, s8
2686; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2687; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2688; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2689; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2690; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2691; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2692; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2693; GFX7-NEXT:    s_endpgm
2694;
2695; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2696; GFX10-WGP:       ; %bb.0: ; %entry
2697; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2698; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2699; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
2700; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
2701; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2702; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
2703; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
2704; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2705; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
2706; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
2707; GFX10-WGP-NEXT:    s_endpgm
2708;
2709; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2710; GFX10-CU:       ; %bb.0: ; %entry
2711; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2712; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2713; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
2714; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
2715; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2716; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
2717; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
2718; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2719; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
2720; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
2721; GFX10-CU-NEXT:    s_endpgm
2722;
2723; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2724; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2725; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
2726; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
2727; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
2728; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
2729; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2730; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
2731; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
2732; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
2733; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
2734; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
2735; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
2736; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
2737; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
2738; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
2739; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
2740; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
2741; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
2742; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2743; SKIP-CACHE-INV-NEXT:    s_endpgm
2744;
2745; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2746; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2747; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2748; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2749; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2750; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2751; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2752; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2753; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2754; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2755; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
2756; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
2757; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2758;
2759; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2760; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2761; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2762; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2763; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2764; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2765; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2766; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2767; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2768; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2769; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
2770; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
2771; GFX90A-TGSPLIT-NEXT:    s_endpgm
2772;
2773; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2774; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2775; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2776; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2777; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2778; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2779; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2780; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2781; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2782; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2783; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
2784; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
2785; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
2786;
2787; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2788; GFX940-TGSPLIT:       ; %bb.0: ; %entry
2789; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2790; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2791; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2792; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2793; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2794; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2795; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2796; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2797; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
2798; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
2799; GFX940-TGSPLIT-NEXT:    s_endpgm
2800;
2801; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2802; GFX11-WGP:       ; %bb.0: ; %entry
2803; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
2804; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2805; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2806; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2807; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2808; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
2809; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
2810; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2811; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
2812; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
2813; GFX11-WGP-NEXT:    s_endpgm
2814;
2815; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2816; GFX11-CU:       ; %bb.0: ; %entry
2817; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
2818; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2819; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2820; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2821; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
2822; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
2823; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
2824; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2825; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
2826; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
2827; GFX11-CU-NEXT:    s_endpgm
2828;
2829; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2830; GFX12-WGP:       ; %bb.0: ; %entry
2831; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
2832; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2833; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
2834; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
2835; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
2836; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
2837; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
2838; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2839; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
2840; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
2841; GFX12-WGP-NEXT:    s_endpgm
2842;
2843; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
2844; GFX12-CU:       ; %bb.0: ; %entry
2845; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
2846; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2847; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
2848; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
2849; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
2850; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
2851; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
2852; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2853; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
2854; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
2855; GFX12-CU-NEXT:    s_endpgm
2856    ptr addrspace(1) %out, i32 %in, i32 %old) {
2857entry:
2858  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2859  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
2860  ret void
2861}
2862
; Wavefront-scope cmpxchg on a global (addrspace(1)) pointer with acquire
; success ordering and monotonic failure ordering. The result %val is unused.
; In every autogenerated check block below, the single cmpswap instruction is
; followed directly by s_endpgm, i.e. the expected output contains no extra
; wait or cache-invalidate instruction after the atomic.
2863define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
2864; GFX6-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2865; GFX6:       ; %bb.0: ; %entry
2866; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
2867; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
2868; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
2869; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
2870; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2871; GFX6-NEXT:    s_mov_b32 s12, s5
2872; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
2873; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
2874; GFX6-NEXT:    s_mov_b32 s11, -1
2875; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
2876; GFX6-NEXT:    s_mov_b32 s5, s12
2877; GFX6-NEXT:    s_mov_b32 s6, s11
2878; GFX6-NEXT:    s_mov_b32 s7, s10
2879; GFX6-NEXT:    v_mov_b32_e32 v0, s9
2880; GFX6-NEXT:    v_mov_b32_e32 v2, s8
2881; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
2882; GFX6-NEXT:    v_mov_b32_e32 v1, v2
2883; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2884; GFX6-NEXT:    s_endpgm
2885;
2886; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2887; GFX7:       ; %bb.0: ; %entry
2888; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2889; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
2890; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
2891; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
2892; GFX7-NEXT:    s_mov_b64 s[10:11], 16
2893; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2894; GFX7-NEXT:    s_mov_b32 s4, s8
2895; GFX7-NEXT:    s_mov_b32 s5, s9
2896; GFX7-NEXT:    s_mov_b32 s9, s10
2897; GFX7-NEXT:    s_mov_b32 s8, s11
2898; GFX7-NEXT:    s_add_u32 s4, s4, s9
2899; GFX7-NEXT:    s_addc_u32 s8, s5, s8
2900; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2901; GFX7-NEXT:    s_mov_b32 s5, s8
2902; GFX7-NEXT:    v_mov_b32_e32 v2, s7
2903; GFX7-NEXT:    v_mov_b32_e32 v0, s6
2904; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2905; GFX7-NEXT:    v_mov_b32_e32 v3, v0
2906; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2907; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2908; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
2909; GFX7-NEXT:    s_endpgm
2910;
2911; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2912; GFX10-WGP:       ; %bb.0: ; %entry
2913; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2914; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2915; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
2916; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
2917; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2918; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
2919; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
2920; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2921; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
2922; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
2923; GFX10-WGP-NEXT:    s_endpgm
2924;
2925; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2926; GFX10-CU:       ; %bb.0: ; %entry
2927; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2928; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2929; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
2930; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
2931; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2932; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
2933; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
2934; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
2935; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
2936; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
2937; GFX10-CU-NEXT:    s_endpgm
2938;
2939; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2940; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2941; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
2942; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
2943; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
2944; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
2945; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2946; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
2947; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
2948; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
2949; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
2950; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
2951; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
2952; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
2953; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
2954; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
2955; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
2956; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
2957; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
2958; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
2959; SKIP-CACHE-INV-NEXT:    s_endpgm
2960;
2961; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2962; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
2963; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2964; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2965; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2966; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2967; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2968; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2969; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2970; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2971; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
2972; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
2973; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
2974;
2975; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2976; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
2977; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2978; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2979; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
2980; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
2981; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2982; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
2983; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
2984; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2985; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
2986; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
2987; GFX90A-TGSPLIT-NEXT:    s_endpgm
2988;
2989; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
2990; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
2991; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
2992; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2993; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
2994; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
2995; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
2996; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
2997; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
2998; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2999; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3000; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3001; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3002;
3003; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
3004; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3005; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3006; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3007; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3008; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3009; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3010; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3011; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3012; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3013; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3014; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3015; GFX940-TGSPLIT-NEXT:    s_endpgm
3016;
3017; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
3018; GFX11-WGP:       ; %bb.0: ; %entry
3019; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
3020; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3021; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3022; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3023; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3024; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
3025; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
3026; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3027; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
3028; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3029; GFX11-WGP-NEXT:    s_endpgm
3030;
3031; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
3032; GFX11-CU:       ; %bb.0: ; %entry
3033; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
3034; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3035; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3036; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3037; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3038; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
3039; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
3040; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3041; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
3042; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3043; GFX11-CU-NEXT:    s_endpgm
3044;
3045; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
3046; GFX12-WGP:       ; %bb.0: ; %entry
3047; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
3048; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3049; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3050; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3051; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3052; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
3053; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
3054; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3055; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
3056; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3057; GFX12-WGP-NEXT:    s_endpgm
3058;
3059; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
3060; GFX12-CU:       ; %bb.0: ; %entry
3061; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
3062; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3063; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3064; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3065; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3066; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
3067; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
3068; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3069; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
3070; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3071; GFX12-CU-NEXT:    s_endpgm
3072    ptr addrspace(1) %out, i32 %in, i32 %old) {
3073entry:
  ; %gep is %out advanced by four i32 elements. The checks above show this as
  ; offset:16 on the buffer and global forms, and as an explicit 64-bit add of
  ; 16 to the base address on the flat form.
3074  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the value at %gep against %old and, on a match, store %in.
3075  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
3076  ret void
3077}
3078
; Wavefront-scope cmpxchg on a global (addrspace(1)) pointer with release
; success ordering and monotonic failure ordering. The result %val is unused.
; In every autogenerated check block below, the only wait before the cmpswap
; instruction is the one that follows the kernel-argument scalar loads, and the
; cmpswap is followed directly by s_endpgm.
3079define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
3080; GFX6-LABEL: global_wavefront_release_monotonic_cmpxchg:
3081; GFX6:       ; %bb.0: ; %entry
3082; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
3083; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
3084; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
3085; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
3086; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3087; GFX6-NEXT:    s_mov_b32 s12, s5
3088; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
3089; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
3090; GFX6-NEXT:    s_mov_b32 s11, -1
3091; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
3092; GFX6-NEXT:    s_mov_b32 s5, s12
3093; GFX6-NEXT:    s_mov_b32 s6, s11
3094; GFX6-NEXT:    s_mov_b32 s7, s10
3095; GFX6-NEXT:    v_mov_b32_e32 v0, s9
3096; GFX6-NEXT:    v_mov_b32_e32 v2, s8
3097; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3098; GFX6-NEXT:    v_mov_b32_e32 v1, v2
3099; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3100; GFX6-NEXT:    s_endpgm
3101;
3102; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
3103; GFX7:       ; %bb.0: ; %entry
3104; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3105; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3106; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3107; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3108; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3109; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3110; GFX7-NEXT:    s_mov_b32 s4, s8
3111; GFX7-NEXT:    s_mov_b32 s5, s9
3112; GFX7-NEXT:    s_mov_b32 s9, s10
3113; GFX7-NEXT:    s_mov_b32 s8, s11
3114; GFX7-NEXT:    s_add_u32 s4, s4, s9
3115; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3116; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3117; GFX7-NEXT:    s_mov_b32 s5, s8
3118; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3119; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3120; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3121; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3122; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3123; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3124; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3125; GFX7-NEXT:    s_endpgm
3126;
3127; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
3128; GFX10-WGP:       ; %bb.0: ; %entry
3129; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3130; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3131; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
3132; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
3133; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3134; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
3135; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
3136; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3137; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
3138; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3139; GFX10-WGP-NEXT:    s_endpgm
3140;
3141; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
3142; GFX10-CU:       ; %bb.0: ; %entry
3143; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3144; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3145; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
3146; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
3147; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3148; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
3149; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
3150; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3151; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
3152; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3153; GFX10-CU-NEXT:    s_endpgm
3154;
3155; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg:
3156; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3157; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
3158; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
3159; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
3160; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
3161; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3162; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
3163; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
3164; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
3165; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
3166; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
3167; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
3168; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
3169; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
3170; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
3171; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
3172; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3173; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
3174; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
3175; SKIP-CACHE-INV-NEXT:    s_endpgm
3176;
3177; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
3178; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3179; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3180; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3181; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3182; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3183; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3184; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3185; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3186; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3187; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3188; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3189; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3190;
3191; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
3192; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3193; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3194; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3195; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3196; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3197; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3198; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3199; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3200; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3201; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3202; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3203; GFX90A-TGSPLIT-NEXT:    s_endpgm
3204;
3205; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
3206; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3207; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3208; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3209; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3210; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3211; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3212; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3213; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3214; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3215; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3216; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3217; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3218;
3219; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
3220; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3221; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3222; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3223; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3224; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3225; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3226; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3227; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3228; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3229; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3230; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3231; GFX940-TGSPLIT-NEXT:    s_endpgm
3232;
3233; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
3234; GFX11-WGP:       ; %bb.0: ; %entry
3235; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
3236; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3237; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3238; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3239; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3240; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
3241; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
3242; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3243; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
3244; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3245; GFX11-WGP-NEXT:    s_endpgm
3246;
3247; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
3248; GFX11-CU:       ; %bb.0: ; %entry
3249; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
3250; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3251; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3252; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3253; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3254; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
3255; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
3256; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3257; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
3258; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3259; GFX11-CU-NEXT:    s_endpgm
3260;
3261; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
3262; GFX12-WGP:       ; %bb.0: ; %entry
3263; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
3264; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3265; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3266; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3267; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3268; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
3269; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
3270; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3271; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
3272; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3273; GFX12-WGP-NEXT:    s_endpgm
3274;
3275; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
3276; GFX12-CU:       ; %bb.0: ; %entry
3277; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
3278; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3279; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3280; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3281; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3282; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
3283; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
3284; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3285; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
3286; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3287; GFX12-CU-NEXT:    s_endpgm
3288    ptr addrspace(1) %out, i32 %in, i32 %old) {
3289entry:
  ; %gep is %out advanced by four i32 elements. The checks above show this as
  ; offset:16 on the buffer and global forms, and as an explicit 64-bit add of
  ; 16 to the base address on the flat form.
3290  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the value at %gep against %old and, on a match, store %in.
3291  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
3292  ret void
3293}
3294
; Wavefront-scope cmpxchg on a global (addrspace(1)) pointer with acq_rel
; success ordering and monotonic failure ordering. In every configuration
; checked below, the expected code is a single atomic cmpswap followed directly
; by s_endpgm; the only wait instruction listed is the one after the scalar
; loads (s_load_*), and nothing is expected between the atomic and s_endpgm.
3295define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
3296; GFX6-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3297; GFX6:       ; %bb.0: ; %entry
3298; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
3299; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
3300; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
3301; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
3302; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3303; GFX6-NEXT:    s_mov_b32 s12, s5
3304; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
3305; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
3306; GFX6-NEXT:    s_mov_b32 s11, -1
3307; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
3308; GFX6-NEXT:    s_mov_b32 s5, s12
3309; GFX6-NEXT:    s_mov_b32 s6, s11
3310; GFX6-NEXT:    s_mov_b32 s7, s10
3311; GFX6-NEXT:    v_mov_b32_e32 v0, s9
3312; GFX6-NEXT:    v_mov_b32_e32 v2, s8
3313; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3314; GFX6-NEXT:    v_mov_b32_e32 v1, v2
3315; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3316; GFX6-NEXT:    s_endpgm
3317;
3318; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3319; GFX7:       ; %bb.0: ; %entry
3320; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3321; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3322; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3323; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3324; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3325; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3326; GFX7-NEXT:    s_mov_b32 s4, s8
3327; GFX7-NEXT:    s_mov_b32 s5, s9
3328; GFX7-NEXT:    s_mov_b32 s9, s10
3329; GFX7-NEXT:    s_mov_b32 s8, s11
3330; GFX7-NEXT:    s_add_u32 s4, s4, s9
3331; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3332; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3333; GFX7-NEXT:    s_mov_b32 s5, s8
3334; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3335; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3336; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3337; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3338; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3339; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3340; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3341; GFX7-NEXT:    s_endpgm
3342;
3343; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3344; GFX10-WGP:       ; %bb.0: ; %entry
3345; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3346; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3347; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
3348; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
3349; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3350; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
3351; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
3352; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3353; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
3354; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3355; GFX10-WGP-NEXT:    s_endpgm
3356;
3357; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3358; GFX10-CU:       ; %bb.0: ; %entry
3359; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3360; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3361; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
3362; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
3363; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3364; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
3365; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
3366; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3367; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
3368; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3369; GFX10-CU-NEXT:    s_endpgm
3370;
3371; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3372; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3373; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
3374; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
3375; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
3376; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
3377; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3378; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
3379; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
3380; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
3381; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
3382; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
3383; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
3384; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
3385; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
3386; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
3387; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
3388; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3389; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
3390; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
3391; SKIP-CACHE-INV-NEXT:    s_endpgm
3392;
3393; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3394; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3395; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3396; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3397; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3398; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3399; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3400; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3401; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3402; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3403; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3404; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3405; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3406;
3407; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3408; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3409; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3410; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3411; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3412; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3413; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3414; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3415; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3416; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3417; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3418; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3419; GFX90A-TGSPLIT-NEXT:    s_endpgm
3420;
3421; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3422; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3423; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3424; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3425; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3426; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3427; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3428; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3429; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3430; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3431; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3432; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3433; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3434;
3435; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3436; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3437; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3438; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3439; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3440; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3441; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3442; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3443; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3444; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3445; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3446; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3447; GFX940-TGSPLIT-NEXT:    s_endpgm
3448;
3449; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3450; GFX11-WGP:       ; %bb.0: ; %entry
3451; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
3452; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3453; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3454; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3455; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3456; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
3457; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
3458; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3459; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
3460; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3461; GFX11-WGP-NEXT:    s_endpgm
3462;
3463; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3464; GFX11-CU:       ; %bb.0: ; %entry
3465; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
3466; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3467; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3468; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3469; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3470; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
3471; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
3472; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3473; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
3474; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3475; GFX11-CU-NEXT:    s_endpgm
3476;
3477; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3478; GFX12-WGP:       ; %bb.0: ; %entry
3479; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
3480; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3481; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3482; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3483; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3484; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
3485; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
3486; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3487; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
3488; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3489; GFX12-WGP-NEXT:    s_endpgm
3490;
3491; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
3492; GFX12-CU:       ; %bb.0: ; %entry
3493; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
3494; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3495; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3496; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3497; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3498; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
3499; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
3500; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3501; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
3502; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3503; GFX12-CU-NEXT:    s_endpgm
3504    ptr addrspace(1) %out, i32 %in, i32 %old) {
3505entry:
; Element index 4 of an i32 pointer is byte offset 16; the expected
; instructions above carry it as offset:16 or as an explicit 64-bit add of 16.
3506  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; %val is not used afterwards.
3507  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
3508  ret void
3509}
3510
; Wavefront-scope cmpxchg on a global (addrspace(1)) pointer with seq_cst
; success ordering and monotonic failure ordering. In every configuration
; checked below, the expected code is a single atomic cmpswap followed directly
; by s_endpgm; the only wait instruction listed is the one after the scalar
; loads (s_load_*), and nothing is expected between the atomic and s_endpgm.
3511define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
3512; GFX6-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3513; GFX6:       ; %bb.0: ; %entry
3514; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
3515; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
3516; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
3517; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
3518; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3519; GFX6-NEXT:    s_mov_b32 s12, s5
3520; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
3521; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
3522; GFX6-NEXT:    s_mov_b32 s11, -1
3523; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
3524; GFX6-NEXT:    s_mov_b32 s5, s12
3525; GFX6-NEXT:    s_mov_b32 s6, s11
3526; GFX6-NEXT:    s_mov_b32 s7, s10
3527; GFX6-NEXT:    v_mov_b32_e32 v0, s9
3528; GFX6-NEXT:    v_mov_b32_e32 v2, s8
3529; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3530; GFX6-NEXT:    v_mov_b32_e32 v1, v2
3531; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3532; GFX6-NEXT:    s_endpgm
3533;
3534; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3535; GFX7:       ; %bb.0: ; %entry
3536; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3537; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3538; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3539; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3540; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3541; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3542; GFX7-NEXT:    s_mov_b32 s4, s8
3543; GFX7-NEXT:    s_mov_b32 s5, s9
3544; GFX7-NEXT:    s_mov_b32 s9, s10
3545; GFX7-NEXT:    s_mov_b32 s8, s11
3546; GFX7-NEXT:    s_add_u32 s4, s4, s9
3547; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3548; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3549; GFX7-NEXT:    s_mov_b32 s5, s8
3550; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3551; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3552; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3553; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3554; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3555; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3556; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3557; GFX7-NEXT:    s_endpgm
3558;
3559; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3560; GFX10-WGP:       ; %bb.0: ; %entry
3561; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3562; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3563; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
3564; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
3565; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3566; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
3567; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
3568; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3569; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
3570; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3571; GFX10-WGP-NEXT:    s_endpgm
3572;
3573; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3574; GFX10-CU:       ; %bb.0: ; %entry
3575; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3576; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3577; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
3578; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
3579; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3580; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
3581; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
3582; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3583; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
3584; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3585; GFX10-CU-NEXT:    s_endpgm
3586;
3587; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3588; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3589; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
3590; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
3591; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
3592; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
3593; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3594; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
3595; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
3596; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
3597; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
3598; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
3599; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
3600; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
3601; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
3602; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
3603; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
3604; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3605; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
3606; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
3607; SKIP-CACHE-INV-NEXT:    s_endpgm
3608;
3609; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3610; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3611; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3612; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3613; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3614; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3615; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3616; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3617; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3618; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3619; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3620; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3621; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3622;
3623; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3624; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3625; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3626; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3627; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3628; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3629; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3630; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3631; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3632; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3633; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3634; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3635; GFX90A-TGSPLIT-NEXT:    s_endpgm
3636;
3637; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3638; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3639; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3640; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3641; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3642; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3643; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3644; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3645; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3646; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3647; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3648; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3649; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3650;
3651; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3652; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3653; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3654; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3655; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3656; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3657; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3658; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3659; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3660; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3661; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3662; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3663; GFX940-TGSPLIT-NEXT:    s_endpgm
3664;
3665; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3666; GFX11-WGP:       ; %bb.0: ; %entry
3667; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
3668; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3669; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3670; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3671; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3672; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
3673; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
3674; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3675; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
3676; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3677; GFX11-WGP-NEXT:    s_endpgm
3678;
3679; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3680; GFX11-CU:       ; %bb.0: ; %entry
3681; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
3682; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3683; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3684; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3685; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3686; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
3687; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
3688; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3689; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
3690; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3691; GFX11-CU-NEXT:    s_endpgm
3692;
3693; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3694; GFX12-WGP:       ; %bb.0: ; %entry
3695; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
3696; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3697; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3698; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3699; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3700; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
3701; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
3702; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3703; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
3704; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3705; GFX12-WGP-NEXT:    s_endpgm
3706;
3707; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
3708; GFX12-CU:       ; %bb.0: ; %entry
3709; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
3710; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3711; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3712; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3713; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3714; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
3715; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
3716; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3717; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
3718; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3719; GFX12-CU-NEXT:    s_endpgm
3720    ptr addrspace(1) %out, i32 %in, i32 %old) {
3721entry:
; Element index 4 of an i32 pointer is byte offset 16; the expected
; instructions above carry it as offset:16 or as an explicit 64-bit add of 16.
3722  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; %val is not used afterwards.
3723  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
3724  ret void
3725}
3726
; Wavefront-scope cmpxchg on a global (addrspace(1)) pointer with monotonic
; success ordering and acquire failure ordering. In every configuration
; checked below, the expected code is a single atomic cmpswap followed directly
; by s_endpgm; the only wait instruction listed is the one after the scalar
; loads (s_load_*), and nothing is expected between the atomic and s_endpgm.
3727define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
3728; GFX6-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3729; GFX6:       ; %bb.0: ; %entry
3730; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
3731; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
3732; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
3733; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
3734; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3735; GFX6-NEXT:    s_mov_b32 s12, s5
3736; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
3737; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
3738; GFX6-NEXT:    s_mov_b32 s11, -1
3739; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
3740; GFX6-NEXT:    s_mov_b32 s5, s12
3741; GFX6-NEXT:    s_mov_b32 s6, s11
3742; GFX6-NEXT:    s_mov_b32 s7, s10
3743; GFX6-NEXT:    v_mov_b32_e32 v0, s9
3744; GFX6-NEXT:    v_mov_b32_e32 v2, s8
3745; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3746; GFX6-NEXT:    v_mov_b32_e32 v1, v2
3747; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3748; GFX6-NEXT:    s_endpgm
3749;
3750; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3751; GFX7:       ; %bb.0: ; %entry
3752; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3753; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3754; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3755; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3756; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3757; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3758; GFX7-NEXT:    s_mov_b32 s4, s8
3759; GFX7-NEXT:    s_mov_b32 s5, s9
3760; GFX7-NEXT:    s_mov_b32 s9, s10
3761; GFX7-NEXT:    s_mov_b32 s8, s11
3762; GFX7-NEXT:    s_add_u32 s4, s4, s9
3763; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3764; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3765; GFX7-NEXT:    s_mov_b32 s5, s8
3766; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3767; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3768; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3769; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3770; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3771; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3772; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3773; GFX7-NEXT:    s_endpgm
3774;
3775; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3776; GFX10-WGP:       ; %bb.0: ; %entry
3777; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3778; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3779; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
3780; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
3781; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3782; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
3783; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
3784; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3785; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
3786; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3787; GFX10-WGP-NEXT:    s_endpgm
3788;
3789; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3790; GFX10-CU:       ; %bb.0: ; %entry
3791; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3792; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3793; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
3794; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
3795; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3796; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
3797; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
3798; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3799; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
3800; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
3801; GFX10-CU-NEXT:    s_endpgm
3802;
3803; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3804; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3805; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
3806; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
3807; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
3808; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
3809; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3810; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
3811; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
3812; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
3813; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
3814; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
3815; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
3816; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
3817; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
3818; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
3819; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
3820; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3821; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
3822; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
3823; SKIP-CACHE-INV-NEXT:    s_endpgm
3824;
3825; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3826; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
3827; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3828; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3829; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3830; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3831; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3832; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3833; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3834; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3835; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3836; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3837; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
3838;
3839; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3840; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
3841; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3842; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3843; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
3844; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
3845; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3846; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
3847; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
3848; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3849; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3850; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
3851; GFX90A-TGSPLIT-NEXT:    s_endpgm
3852;
3853; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3854; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
3855; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3856; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3857; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3858; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3859; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3860; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3861; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3862; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3863; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3864; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3865; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
3866;
3867; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3868; GFX940-TGSPLIT:       ; %bb.0: ; %entry
3869; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
3870; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3871; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
3872; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
3873; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
3874; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
3875; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
3876; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3877; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
3878; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
3879; GFX940-TGSPLIT-NEXT:    s_endpgm
3880;
3881; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3882; GFX11-WGP:       ; %bb.0: ; %entry
3883; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
3884; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3885; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3886; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3887; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3888; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
3889; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
3890; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3891; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
3892; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3893; GFX11-WGP-NEXT:    s_endpgm
3894;
3895; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3896; GFX11-CU:       ; %bb.0: ; %entry
3897; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
3898; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3899; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3900; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3901; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
3902; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
3903; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
3904; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3905; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
3906; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3907; GFX11-CU-NEXT:    s_endpgm
3908;
3909; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3910; GFX12-WGP:       ; %bb.0: ; %entry
3911; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
3912; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3913; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
3914; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
3915; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
3916; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
3917; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
3918; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3919; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
3920; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3921; GFX12-WGP-NEXT:    s_endpgm
3922;
3923; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
3924; GFX12-CU:       ; %bb.0: ; %entry
3925; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
3926; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
3927; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
3928; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
3929; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
3930; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
3931; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
3932; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
3933; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
3934; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
3935; GFX12-CU-NEXT:    s_endpgm
3936    ptr addrspace(1) %out, i32 %in, i32 %old) {
3937entry:
; Element index 4 of an i32 pointer is byte offset 16; the expected
; instructions above carry it as offset:16 or as an explicit 64-bit add of 16.
3938  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; %val is not used afterwards.
3939  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
3940  ret void
3941}
3942
; Compare-and-swap on a global (addrspace(1)) i32 at "wavefront" sync scope,
; with success ordering acquire and failure ordering acquire.
; The assertions in this function are autogenerated (see the NOTE at the top of
; the file). For every run line they expect a single cmpswap atomic followed
; directly by s_endpgm, i.e. no wait or cache-invalidate after the atomic.
3943define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
3944; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg:
3945; GFX6:       ; %bb.0: ; %entry
3946; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
3947; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
3948; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
3949; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
3950; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3951; GFX6-NEXT:    s_mov_b32 s12, s5
3952; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
3953; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
3954; GFX6-NEXT:    s_mov_b32 s11, -1
3955; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
3956; GFX6-NEXT:    s_mov_b32 s5, s12
3957; GFX6-NEXT:    s_mov_b32 s6, s11
3958; GFX6-NEXT:    s_mov_b32 s7, s10
3959; GFX6-NEXT:    v_mov_b32_e32 v0, s9
3960; GFX6-NEXT:    v_mov_b32_e32 v2, s8
3961; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
3962; GFX6-NEXT:    v_mov_b32_e32 v1, v2
3963; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3964; GFX6-NEXT:    s_endpgm
3965;
3966; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
3967; GFX7:       ; %bb.0: ; %entry
3968; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3969; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
3970; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
3971; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
3972; GFX7-NEXT:    s_mov_b64 s[10:11], 16
3973; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3974; GFX7-NEXT:    s_mov_b32 s4, s8
3975; GFX7-NEXT:    s_mov_b32 s5, s9
3976; GFX7-NEXT:    s_mov_b32 s9, s10
3977; GFX7-NEXT:    s_mov_b32 s8, s11
3978; GFX7-NEXT:    s_add_u32 s4, s4, s9
3979; GFX7-NEXT:    s_addc_u32 s8, s5, s8
3980; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3981; GFX7-NEXT:    s_mov_b32 s5, s8
3982; GFX7-NEXT:    v_mov_b32_e32 v2, s7
3983; GFX7-NEXT:    v_mov_b32_e32 v0, s6
3984; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3985; GFX7-NEXT:    v_mov_b32_e32 v3, v0
3986; GFX7-NEXT:    v_mov_b32_e32 v0, s4
3987; GFX7-NEXT:    v_mov_b32_e32 v1, s5
3988; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3989; GFX7-NEXT:    s_endpgm
3990;
3991; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
3992; GFX10-WGP:       ; %bb.0: ; %entry
3993; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3994; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3995; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
3996; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
3997; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3998; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
3999; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
4000; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4001; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
4002; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4003; GFX10-WGP-NEXT:    s_endpgm
4004;
4005; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4006; GFX10-CU:       ; %bb.0: ; %entry
4007; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4008; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4009; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
4010; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
4011; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4012; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
4013; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
4014; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4015; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
4016; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4017; GFX10-CU-NEXT:    s_endpgm
4018;
4019; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4020; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4021; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
4022; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
4023; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
4024; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
4025; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4026; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
4027; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
4028; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
4029; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
4030; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
4031; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
4032; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
4033; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
4034; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
4035; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
4036; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4037; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
4038; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
4039; SKIP-CACHE-INV-NEXT:    s_endpgm
4040;
4041; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4042; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4043; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4044; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4045; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4046; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4047; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4048; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4049; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4050; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4051; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4052; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4053; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4054;
4055; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4056; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4057; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4058; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4059; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4060; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4061; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4062; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4063; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4064; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4065; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4066; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4067; GFX90A-TGSPLIT-NEXT:    s_endpgm
4068;
4069; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4070; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4071; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4072; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4073; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4074; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4075; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4076; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4077; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4078; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4079; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4080; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4081; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4082;
4083; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4084; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4085; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4086; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4087; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4088; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4089; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4090; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4091; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4092; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4093; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4094; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4095; GFX940-TGSPLIT-NEXT:    s_endpgm
4096;
4097; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4098; GFX11-WGP:       ; %bb.0: ; %entry
4099; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
4100; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4101; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4102; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4103; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4104; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
4105; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
4106; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4107; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
4108; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4109; GFX11-WGP-NEXT:    s_endpgm
4110;
4111; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4112; GFX11-CU:       ; %bb.0: ; %entry
4113; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
4114; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4115; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4116; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4117; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4118; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
4119; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
4120; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4121; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
4122; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4123; GFX11-CU-NEXT:    s_endpgm
4124;
4125; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4126; GFX12-WGP:       ; %bb.0: ; %entry
4127; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
4128; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4129; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4130; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4131; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4132; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
4133; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
4134; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4135; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
4136; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4137; GFX12-WGP-NEXT:    s_endpgm
4138;
4139; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
4140; GFX12-CU:       ; %bb.0: ; %entry
4141; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
4142; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4143; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4144; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4145; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4146; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
4147; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
4148; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4149; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
4150; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4151; GFX12-CU-NEXT:    s_endpgm
4152    ptr addrspace(1) %out, i32 %in, i32 %old) {
4153entry:
; %out advanced by 4 i32 elements, i.e. 16 bytes; this is the 16 that the
; expected code either folds into the atomic's offset or adds to the address.
4154  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare value and %in the new value; the result %val is unused.
4155  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
4156  ret void
4157}
4158
; Compare-and-swap on a global (addrspace(1)) i32 at "wavefront" sync scope,
; with success ordering release and failure ordering acquire.
; The assertions in this function are autogenerated (see the NOTE at the top of
; the file). For every run line they expect a single cmpswap atomic followed
; directly by s_endpgm; the only wait before it is the one after the scalar
; argument loads.
4159define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
4160; GFX6-LABEL: global_wavefront_release_acquire_cmpxchg:
4161; GFX6:       ; %bb.0: ; %entry
4162; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
4163; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
4164; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
4165; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
4166; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4167; GFX6-NEXT:    s_mov_b32 s12, s5
4168; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
4169; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
4170; GFX6-NEXT:    s_mov_b32 s11, -1
4171; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
4172; GFX6-NEXT:    s_mov_b32 s5, s12
4173; GFX6-NEXT:    s_mov_b32 s6, s11
4174; GFX6-NEXT:    s_mov_b32 s7, s10
4175; GFX6-NEXT:    v_mov_b32_e32 v0, s9
4176; GFX6-NEXT:    v_mov_b32_e32 v2, s8
4177; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4178; GFX6-NEXT:    v_mov_b32_e32 v1, v2
4179; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4180; GFX6-NEXT:    s_endpgm
4181;
4182; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
4183; GFX7:       ; %bb.0: ; %entry
4184; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4185; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4186; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4187; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4188; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4189; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4190; GFX7-NEXT:    s_mov_b32 s4, s8
4191; GFX7-NEXT:    s_mov_b32 s5, s9
4192; GFX7-NEXT:    s_mov_b32 s9, s10
4193; GFX7-NEXT:    s_mov_b32 s8, s11
4194; GFX7-NEXT:    s_add_u32 s4, s4, s9
4195; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4196; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4197; GFX7-NEXT:    s_mov_b32 s5, s8
4198; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4199; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4200; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4201; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4202; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4203; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4204; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4205; GFX7-NEXT:    s_endpgm
4206;
4207; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
4208; GFX10-WGP:       ; %bb.0: ; %entry
4209; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4210; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4211; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
4212; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
4213; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4214; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
4215; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
4216; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4217; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
4218; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4219; GFX10-WGP-NEXT:    s_endpgm
4220;
4221; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
4222; GFX10-CU:       ; %bb.0: ; %entry
4223; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4224; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4225; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
4226; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
4227; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4228; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
4229; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
4230; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4231; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
4232; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4233; GFX10-CU-NEXT:    s_endpgm
4234;
4235; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg:
4236; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4237; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
4238; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
4239; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
4240; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
4241; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4242; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
4243; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
4244; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
4245; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
4246; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
4247; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
4248; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
4249; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
4250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
4251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
4252; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4253; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
4254; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
4255; SKIP-CACHE-INV-NEXT:    s_endpgm
4256;
4257; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
4258; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4259; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4260; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4261; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4262; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4263; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4264; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4265; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4266; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4267; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4268; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4269; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4270;
4271; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
4272; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4273; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4274; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4275; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4276; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4277; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4278; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4279; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4280; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4281; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4282; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4283; GFX90A-TGSPLIT-NEXT:    s_endpgm
4284;
4285; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
4286; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4287; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4288; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4289; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4290; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4291; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4292; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4293; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4294; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4295; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4296; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4297; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4298;
4299; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
4300; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4301; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4302; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4303; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4304; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4305; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4306; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4307; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4308; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4309; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4310; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4311; GFX940-TGSPLIT-NEXT:    s_endpgm
4312;
4313; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
4314; GFX11-WGP:       ; %bb.0: ; %entry
4315; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
4316; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4317; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4318; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4319; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4320; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
4321; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
4322; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4323; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
4324; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4325; GFX11-WGP-NEXT:    s_endpgm
4326;
4327; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
4328; GFX11-CU:       ; %bb.0: ; %entry
4329; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
4330; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4331; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4332; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4333; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4334; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
4335; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
4336; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4337; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
4338; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4339; GFX11-CU-NEXT:    s_endpgm
4340;
4341; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
4342; GFX12-WGP:       ; %bb.0: ; %entry
4343; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
4344; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4345; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4346; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4347; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4348; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
4349; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
4350; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4351; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
4352; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4353; GFX12-WGP-NEXT:    s_endpgm
4354;
4355; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
4356; GFX12-CU:       ; %bb.0: ; %entry
4357; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
4358; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4359; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4360; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4361; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4362; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
4363; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
4364; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4365; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
4366; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4367; GFX12-CU-NEXT:    s_endpgm
4368    ptr addrspace(1) %out, i32 %in, i32 %old) {
4369entry:
; %out advanced by 4 i32 elements, i.e. 16 bytes; this is the 16 that the
; expected code either folds into the atomic's offset or adds to the address.
4370  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare value and %in the new value; the result %val is unused.
4371  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
4372  ret void
4373}
4374
; Compare-and-swap on a global (addrspace(1)) i32 at "wavefront" sync scope,
; with success ordering acq_rel and failure ordering acquire.
; The assertions in this function are autogenerated (see the NOTE at the top of
; the file). For every run line they expect a single cmpswap atomic followed
; directly by s_endpgm; the only wait before it is the one after the scalar
; argument loads.
4375define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
4376; GFX6-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4377; GFX6:       ; %bb.0: ; %entry
4378; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
4379; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
4380; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
4381; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
4382; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4383; GFX6-NEXT:    s_mov_b32 s12, s5
4384; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
4385; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
4386; GFX6-NEXT:    s_mov_b32 s11, -1
4387; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
4388; GFX6-NEXT:    s_mov_b32 s5, s12
4389; GFX6-NEXT:    s_mov_b32 s6, s11
4390; GFX6-NEXT:    s_mov_b32 s7, s10
4391; GFX6-NEXT:    v_mov_b32_e32 v0, s9
4392; GFX6-NEXT:    v_mov_b32_e32 v2, s8
4393; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4394; GFX6-NEXT:    v_mov_b32_e32 v1, v2
4395; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4396; GFX6-NEXT:    s_endpgm
4397;
4398; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4399; GFX7:       ; %bb.0: ; %entry
4400; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4401; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4402; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4403; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4404; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4405; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4406; GFX7-NEXT:    s_mov_b32 s4, s8
4407; GFX7-NEXT:    s_mov_b32 s5, s9
4408; GFX7-NEXT:    s_mov_b32 s9, s10
4409; GFX7-NEXT:    s_mov_b32 s8, s11
4410; GFX7-NEXT:    s_add_u32 s4, s4, s9
4411; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4412; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4413; GFX7-NEXT:    s_mov_b32 s5, s8
4414; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4415; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4416; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4417; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4418; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4419; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4420; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4421; GFX7-NEXT:    s_endpgm
4422;
4423; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4424; GFX10-WGP:       ; %bb.0: ; %entry
4425; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4426; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4427; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
4428; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
4429; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4430; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
4431; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
4432; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4433; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
4434; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4435; GFX10-WGP-NEXT:    s_endpgm
4436;
4437; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4438; GFX10-CU:       ; %bb.0: ; %entry
4439; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4440; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4441; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
4442; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
4443; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4444; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
4445; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
4446; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4447; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
4448; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4449; GFX10-CU-NEXT:    s_endpgm
4450;
4451; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4452; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4453; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
4454; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
4455; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
4456; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
4457; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4458; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
4459; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
4460; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
4461; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
4462; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
4463; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
4464; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
4465; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
4466; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
4467; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
4468; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4469; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
4470; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
4471; SKIP-CACHE-INV-NEXT:    s_endpgm
4472;
4473; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4474; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4475; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4476; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4477; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4478; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4479; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4480; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4481; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4482; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4483; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4484; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4485; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4486;
4487; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4488; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4489; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4490; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4491; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4492; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4493; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4494; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4495; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4496; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4497; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4498; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4499; GFX90A-TGSPLIT-NEXT:    s_endpgm
4500;
4501; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4502; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4503; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4504; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4505; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4506; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4507; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4508; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4509; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4510; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4511; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4512; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4513; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4514;
4515; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4516; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4517; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4518; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4519; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4520; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4521; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4522; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4523; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4524; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4525; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4526; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4527; GFX940-TGSPLIT-NEXT:    s_endpgm
4528;
4529; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4530; GFX11-WGP:       ; %bb.0: ; %entry
4531; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
4532; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4533; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4534; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4535; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4536; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
4537; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
4538; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4539; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
4540; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4541; GFX11-WGP-NEXT:    s_endpgm
4542;
4543; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4544; GFX11-CU:       ; %bb.0: ; %entry
4545; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
4546; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4547; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4548; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4549; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4550; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
4551; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
4552; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4553; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
4554; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4555; GFX11-CU-NEXT:    s_endpgm
4556;
4557; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4558; GFX12-WGP:       ; %bb.0: ; %entry
4559; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
4560; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4561; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4562; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4563; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4564; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
4565; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
4566; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4567; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
4568; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4569; GFX12-WGP-NEXT:    s_endpgm
4570;
4571; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
4572; GFX12-CU:       ; %bb.0: ; %entry
4573; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
4574; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4575; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4576; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4577; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4578; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
4579; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
4580; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4581; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
4582; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4583; GFX12-CU-NEXT:    s_endpgm
4584    ptr addrspace(1) %out, i32 %in, i32 %old) {
4585entry:
; %out advanced by 4 i32 elements, i.e. 16 bytes; this is the 16 that the
; expected code either folds into the atomic's offset or adds to the address.
4586  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare value and %in the new value; the result %val is unused.
4587  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
4588  ret void
4589}
4590
; Kernel under test - a volatile i32 cmpxchg on a global (addrspace(1)) pointer
; at syncscope("wavefront"), success ordering seq_cst, failure ordering acquire.
; In every expected sequence below the cmpswap atomic is followed directly by
; s_endpgm, and the only wait instruction is the one that follows the scalar
; argument loads.
4591define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
4592; GFX6-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4593; GFX6:       ; %bb.0: ; %entry
4594; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
4595; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
4596; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
4597; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
4598; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4599; GFX6-NEXT:    s_mov_b32 s12, s5
4600; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
4601; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
4602; GFX6-NEXT:    s_mov_b32 s11, -1
4603; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
4604; GFX6-NEXT:    s_mov_b32 s5, s12
4605; GFX6-NEXT:    s_mov_b32 s6, s11
4606; GFX6-NEXT:    s_mov_b32 s7, s10
4607; GFX6-NEXT:    v_mov_b32_e32 v0, s9
4608; GFX6-NEXT:    v_mov_b32_e32 v2, s8
4609; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4610; GFX6-NEXT:    v_mov_b32_e32 v1, v2
4611; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4612; GFX6-NEXT:    s_endpgm
4613;
4614; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4615; GFX7:       ; %bb.0: ; %entry
4616; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4617; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4618; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4619; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4620; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4621; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4622; GFX7-NEXT:    s_mov_b32 s4, s8
4623; GFX7-NEXT:    s_mov_b32 s5, s9
4624; GFX7-NEXT:    s_mov_b32 s9, s10
4625; GFX7-NEXT:    s_mov_b32 s8, s11
4626; GFX7-NEXT:    s_add_u32 s4, s4, s9
4627; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4628; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4629; GFX7-NEXT:    s_mov_b32 s5, s8
4630; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4631; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4632; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4633; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4634; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4635; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4636; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4637; GFX7-NEXT:    s_endpgm
4638;
4639; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4640; GFX10-WGP:       ; %bb.0: ; %entry
4641; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4642; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4643; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
4644; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
4645; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4646; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
4647; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
4648; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4649; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
4650; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4651; GFX10-WGP-NEXT:    s_endpgm
4652;
4653; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4654; GFX10-CU:       ; %bb.0: ; %entry
4655; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4656; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4657; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
4658; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
4659; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4660; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
4661; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
4662; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4663; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
4664; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4665; GFX10-CU-NEXT:    s_endpgm
4666;
4667; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4668; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4669; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
4670; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
4671; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
4672; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
4673; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4674; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
4675; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
4676; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
4677; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
4678; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
4679; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
4680; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
4681; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
4682; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
4683; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
4684; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
4686; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
4687; SKIP-CACHE-INV-NEXT:    s_endpgm
4688;
4689; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4690; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4691; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4692; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4693; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4694; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4695; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4696; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4697; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4698; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4699; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4700; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4701; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4702;
4703; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4704; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4705; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4706; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4707; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4708; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4709; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4710; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4711; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4712; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4713; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4714; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4715; GFX90A-TGSPLIT-NEXT:    s_endpgm
4716;
4717; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4718; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4719; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4720; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4721; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4722; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4723; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4724; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4725; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4726; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4727; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4728; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4729; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4730;
4731; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4732; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4733; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4734; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4735; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4736; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4737; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4738; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4739; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4740; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4741; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4742; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4743; GFX940-TGSPLIT-NEXT:    s_endpgm
4744;
4745; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4746; GFX11-WGP:       ; %bb.0: ; %entry
4747; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
4748; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4749; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4750; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4751; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4752; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
4753; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
4754; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4755; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
4756; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4757; GFX11-WGP-NEXT:    s_endpgm
4758;
4759; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4760; GFX11-CU:       ; %bb.0: ; %entry
4761; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
4762; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4763; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4764; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4765; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4766; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
4767; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
4768; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4769; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
4770; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4771; GFX11-CU-NEXT:    s_endpgm
4772;
4773; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4774; GFX12-WGP:       ; %bb.0: ; %entry
4775; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
4776; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4777; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4778; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4779; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4780; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
4781; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
4782; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4783; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
4784; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4785; GFX12-WGP-NEXT:    s_endpgm
4786;
4787; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
4788; GFX12-CU:       ; %bb.0: ; %entry
4789; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
4790; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4791; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4792; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4793; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
4794; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
4795; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
4796; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4797; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
4798; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4799; GFX12-CU-NEXT:    s_endpgm
4800    ptr addrspace(1) %out, i32 %in, i32 %old) {
4801entry:
  ; Address the 5th i32 of %out (index 4, i.e. byte offset 16, which is the
  ; offset of 16 that appears in the expected atomics above).
4802  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on match; the result %val is never used.
4803  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
4804  ret void
4805}
4806
; Kernel under test - a volatile i32 cmpxchg on a global (addrspace(1)) pointer
; at syncscope("wavefront"), success ordering monotonic, failure ordering
; seq_cst. In every expected sequence below the cmpswap atomic is followed
; directly by s_endpgm, and the only wait instruction is the one that follows
; the scalar argument loads.
4807define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
4808; GFX6-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4809; GFX6:       ; %bb.0: ; %entry
4810; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
4811; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
4812; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
4813; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
4814; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4815; GFX6-NEXT:    s_mov_b32 s12, s5
4816; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
4817; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
4818; GFX6-NEXT:    s_mov_b32 s11, -1
4819; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
4820; GFX6-NEXT:    s_mov_b32 s5, s12
4821; GFX6-NEXT:    s_mov_b32 s6, s11
4822; GFX6-NEXT:    s_mov_b32 s7, s10
4823; GFX6-NEXT:    v_mov_b32_e32 v0, s9
4824; GFX6-NEXT:    v_mov_b32_e32 v2, s8
4825; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4826; GFX6-NEXT:    v_mov_b32_e32 v1, v2
4827; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4828; GFX6-NEXT:    s_endpgm
4829;
4830; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4831; GFX7:       ; %bb.0: ; %entry
4832; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4833; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
4834; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
4835; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
4836; GFX7-NEXT:    s_mov_b64 s[10:11], 16
4837; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4838; GFX7-NEXT:    s_mov_b32 s4, s8
4839; GFX7-NEXT:    s_mov_b32 s5, s9
4840; GFX7-NEXT:    s_mov_b32 s9, s10
4841; GFX7-NEXT:    s_mov_b32 s8, s11
4842; GFX7-NEXT:    s_add_u32 s4, s4, s9
4843; GFX7-NEXT:    s_addc_u32 s8, s5, s8
4844; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4845; GFX7-NEXT:    s_mov_b32 s5, s8
4846; GFX7-NEXT:    v_mov_b32_e32 v2, s7
4847; GFX7-NEXT:    v_mov_b32_e32 v0, s6
4848; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4849; GFX7-NEXT:    v_mov_b32_e32 v3, v0
4850; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4851; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4852; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4853; GFX7-NEXT:    s_endpgm
4854;
4855; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4856; GFX10-WGP:       ; %bb.0: ; %entry
4857; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
4858; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4859; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
4860; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
4861; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4862; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
4863; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
4864; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4865; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
4866; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4867; GFX10-WGP-NEXT:    s_endpgm
4868;
4869; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4870; GFX10-CU:       ; %bb.0: ; %entry
4871; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
4872; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4873; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
4874; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
4875; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4876; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
4877; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
4878; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4879; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
4880; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
4881; GFX10-CU-NEXT:    s_endpgm
4882;
4883; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4884; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4885; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
4886; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
4887; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
4888; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
4889; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4890; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
4891; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
4892; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
4893; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
4894; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
4895; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
4896; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
4897; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
4898; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
4899; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
4900; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
4901; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
4902; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
4903; SKIP-CACHE-INV-NEXT:    s_endpgm
4904;
4905; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4906; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
4907; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4908; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4909; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4910; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4911; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4912; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4913; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4914; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4915; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4916; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4917; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
4918;
4919; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4920; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
4921; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4922; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
4923; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
4924; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
4925; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4926; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
4927; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
4928; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4929; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4930; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
4931; GFX90A-TGSPLIT-NEXT:    s_endpgm
4932;
4933; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4934; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
4935; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4936; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4937; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4938; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4939; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4940; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4941; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4942; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4943; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4944; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4945; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
4946;
4947; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4948; GFX940-TGSPLIT:       ; %bb.0: ; %entry
4949; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
4950; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4951; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
4952; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
4953; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
4954; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
4955; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
4956; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4957; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
4958; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
4959; GFX940-TGSPLIT-NEXT:    s_endpgm
4960;
4961; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4962; GFX11-WGP:       ; %bb.0: ; %entry
4963; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
4964; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4965; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4966; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4967; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4968; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
4969; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
4970; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4971; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
4972; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4973; GFX11-WGP-NEXT:    s_endpgm
4974;
4975; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4976; GFX11-CU:       ; %bb.0: ; %entry
4977; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
4978; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4979; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
4980; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
4981; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
4982; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
4983; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
4984; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4985; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
4986; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
4987; GFX11-CU-NEXT:    s_endpgm
4988;
4989; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
4990; GFX12-WGP:       ; %bb.0: ; %entry
4991; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
4992; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
4993; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
4994; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
4995; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
4996; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
4997; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
4998; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
4999; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
5000; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5001; GFX12-WGP-NEXT:    s_endpgm
5002;
5003; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
5004; GFX12-CU:       ; %bb.0: ; %entry
5005; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
5006; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5007; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5008; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5009; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5010; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
5011; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
5012; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5013; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
5014; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5015; GFX12-CU-NEXT:    s_endpgm
5016    ptr addrspace(1) %out, i32 %in, i32 %old) {
5017entry:
  ; Address the 5th i32 of %out (index 4, i.e. byte offset 16, which is the
  ; offset of 16 that appears in the expected atomics above).
5018  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on match; the result %val is never used.
5019  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
5020  ret void
5021}
5022
; Kernel under test - a volatile i32 cmpxchg on a global (addrspace(1)) pointer
; at syncscope("wavefront"), success ordering acquire, failure ordering seq_cst.
; In every expected sequence below the cmpswap atomic is followed directly by
; s_endpgm, and the only wait instruction is the one that follows the scalar
; argument loads.
5023define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
5024; GFX6-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5025; GFX6:       ; %bb.0: ; %entry
5026; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
5027; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5028; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
5029; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
5030; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5031; GFX6-NEXT:    s_mov_b32 s12, s5
5032; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
5033; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
5034; GFX6-NEXT:    s_mov_b32 s11, -1
5035; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
5036; GFX6-NEXT:    s_mov_b32 s5, s12
5037; GFX6-NEXT:    s_mov_b32 s6, s11
5038; GFX6-NEXT:    s_mov_b32 s7, s10
5039; GFX6-NEXT:    v_mov_b32_e32 v0, s9
5040; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5041; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5042; GFX6-NEXT:    v_mov_b32_e32 v1, v2
5043; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5044; GFX6-NEXT:    s_endpgm
5045;
5046; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5047; GFX7:       ; %bb.0: ; %entry
5048; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5049; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5050; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5051; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5052; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5053; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX7-NEXT:    s_mov_b32 s4, s8
5055; GFX7-NEXT:    s_mov_b32 s5, s9
5056; GFX7-NEXT:    s_mov_b32 s9, s10
5057; GFX7-NEXT:    s_mov_b32 s8, s11
5058; GFX7-NEXT:    s_add_u32 s4, s4, s9
5059; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5060; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5061; GFX7-NEXT:    s_mov_b32 s5, s8
5062; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5063; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5064; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5065; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5066; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5067; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5068; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5069; GFX7-NEXT:    s_endpgm
5070;
5071; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5072; GFX10-WGP:       ; %bb.0: ; %entry
5073; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5074; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5075; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
5076; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
5077; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5078; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5079; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
5080; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5081; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
5082; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5083; GFX10-WGP-NEXT:    s_endpgm
5084;
5085; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5086; GFX10-CU:       ; %bb.0: ; %entry
5087; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5088; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5089; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
5090; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
5091; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5092; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5093; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
5094; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5095; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
5096; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5097; GFX10-CU-NEXT:    s_endpgm
5098;
5099; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5100; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5101; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5102; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5103; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5104; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5105; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5106; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
5107; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
5108; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
5109; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
5110; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
5111; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
5112; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
5113; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5114; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
5115; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
5116; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5117; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
5118; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5119; SKIP-CACHE-INV-NEXT:    s_endpgm
5120;
5121; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5122; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5123; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5124; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5125; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5126; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5127; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5128; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5129; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5130; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5131; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5132; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5133; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5134;
5135; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5136; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5137; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5138; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5139; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5140; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5141; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5142; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5143; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5144; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5145; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5146; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5147; GFX90A-TGSPLIT-NEXT:    s_endpgm
5148;
5149; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5150; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5151; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5152; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5153; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5154; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5155; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5156; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5157; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5158; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5159; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5160; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5161; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5162;
5163; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5164; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5165; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5166; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5167; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5168; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5169; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5170; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5171; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5172; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5173; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5174; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5175; GFX940-TGSPLIT-NEXT:    s_endpgm
5176;
5177; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5178; GFX11-WGP:       ; %bb.0: ; %entry
5179; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
5180; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5181; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5182; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5183; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5184; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
5185; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
5186; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5187; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
5188; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5189; GFX11-WGP-NEXT:    s_endpgm
5190;
5191; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5192; GFX11-CU:       ; %bb.0: ; %entry
5193; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
5194; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5195; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5196; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5197; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5198; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
5199; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
5200; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5201; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
5202; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5203; GFX11-CU-NEXT:    s_endpgm
5204;
5205; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5206; GFX12-WGP:       ; %bb.0: ; %entry
5207; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
5208; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5209; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5210; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5211; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5212; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
5213; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
5214; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5215; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
5216; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5217; GFX12-WGP-NEXT:    s_endpgm
5218;
5219; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
5220; GFX12-CU:       ; %bb.0: ; %entry
5221; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
5222; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5223; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5224; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5225; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5226; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
5227; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
5228; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5229; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
5230; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5231; GFX12-CU-NEXT:    s_endpgm
5232    ptr addrspace(1) %out, i32 %in, i32 %old) {
5233entry:
  ; Address the 5th i32 of %out (index 4, i.e. byte offset 16, which is the
  ; offset of 16 that appears in the expected atomics above).
5234  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on match; the result %val is never used.
5235  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
5236  ret void
5237}
5238
; Test kernel: a volatile cmpxchg on a global (addrspace(1)) pointer at
; syncscope("wavefront"), with success ordering release and failure ordering
; seq_cst. The result %val is never used.
; In every check block below the expected code consists of scalar loads
; (presumably fetching the three kernel arguments), one s_waitcnt lgkmcnt(0)
; (s_wait_kmcnt 0x0 in the gfx1200 blocks), register setup, and a single
; cmpswap atomic that is immediately followed by s_endpgm.
5239define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
5240; GFX6-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5241; GFX6:       ; %bb.0: ; %entry
5242; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
5243; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5244; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
5245; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
5246; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5247; GFX6-NEXT:    s_mov_b32 s12, s5
5248; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
5249; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
5250; GFX6-NEXT:    s_mov_b32 s11, -1
5251; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
5252; GFX6-NEXT:    s_mov_b32 s5, s12
5253; GFX6-NEXT:    s_mov_b32 s6, s11
5254; GFX6-NEXT:    s_mov_b32 s7, s10
5255; GFX6-NEXT:    v_mov_b32_e32 v0, s9
5256; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5257; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5258; GFX6-NEXT:    v_mov_b32_e32 v1, v2
5259; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5260; GFX6-NEXT:    s_endpgm
5261;
5262; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5263; GFX7:       ; %bb.0: ; %entry
5264; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5265; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5266; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5267; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5268; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5269; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5270; GFX7-NEXT:    s_mov_b32 s4, s8
5271; GFX7-NEXT:    s_mov_b32 s5, s9
5272; GFX7-NEXT:    s_mov_b32 s9, s10
5273; GFX7-NEXT:    s_mov_b32 s8, s11
5274; GFX7-NEXT:    s_add_u32 s4, s4, s9
5275; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5276; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5277; GFX7-NEXT:    s_mov_b32 s5, s8
5278; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5279; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5280; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5281; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5282; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5283; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5284; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5285; GFX7-NEXT:    s_endpgm
5286;
5287; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5288; GFX10-WGP:       ; %bb.0: ; %entry
5289; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5290; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5291; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
5292; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
5293; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5294; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5295; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
5296; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5297; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
5298; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5299; GFX10-WGP-NEXT:    s_endpgm
5300;
5301; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5302; GFX10-CU:       ; %bb.0: ; %entry
5303; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5304; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5305; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
5306; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
5307; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5308; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5309; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
5310; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5311; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
5312; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5313; GFX10-CU-NEXT:    s_endpgm
5314;
5315; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5316; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5317; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5318; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5319; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5320; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5321; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5322; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
5323; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
5324; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
5325; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
5326; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
5327; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
5328; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
5329; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
5331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
5332; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5333; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
5334; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5335; SKIP-CACHE-INV-NEXT:    s_endpgm
5336;
5337; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5338; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5339; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5340; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5341; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5342; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5343; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5344; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5345; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5346; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5347; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5348; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5349; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5350;
5351; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5352; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5353; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5354; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5355; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5356; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5357; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5358; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5359; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5360; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5361; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5362; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5363; GFX90A-TGSPLIT-NEXT:    s_endpgm
5364;
5365; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5366; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5367; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5368; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5369; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5370; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5371; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5372; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5373; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5374; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5375; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5376; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5377; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5378;
5379; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5380; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5381; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5382; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5383; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5384; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5385; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5386; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5387; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5388; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5389; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5390; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5391; GFX940-TGSPLIT-NEXT:    s_endpgm
5392;
5393; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5394; GFX11-WGP:       ; %bb.0: ; %entry
5395; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
5396; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5397; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5398; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5399; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5400; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
5401; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
5402; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5403; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
5404; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5405; GFX11-WGP-NEXT:    s_endpgm
5406;
5407; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5408; GFX11-CU:       ; %bb.0: ; %entry
5409; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
5410; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5411; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5412; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5413; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5414; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
5415; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
5416; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5417; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
5418; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5419; GFX11-CU-NEXT:    s_endpgm
5420;
5421; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5422; GFX12-WGP:       ; %bb.0: ; %entry
5423; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
5424; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5425; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5426; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5427; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5428; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
5429; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
5430; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5431; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
5432; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5433; GFX12-WGP-NEXT:    s_endpgm
5434;
5435; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
5436; GFX12-CU:       ; %bb.0: ; %entry
5437; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
5438; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5439; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5440; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5441; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5442; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
5443; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
5444; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5445; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
5446; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5447; GFX12-CU-NEXT:    s_endpgm
5448    ptr addrspace(1) %out, i32 %in, i32 %old) {
5449entry:
  ; i32 element index 4 is 16 bytes past %out; the checks above show this as
  ; offset:16 on the atomic, or as an explicit 64-bit add of 16 in the gfx700
  ; amdhsa block, which uses flat_atomic_cmpswap.
5450  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; cmpxchg operands are pointer, expected value (%old), new value (%in);
  ; orderings are success = release, failure = seq_cst.
5451  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
5452  ret void
5453}
5454
; Test kernel: a volatile cmpxchg on a global (addrspace(1)) pointer at
; syncscope("wavefront"), with success ordering acq_rel and failure ordering
; seq_cst. The result %val is never used.
; In every check block below the expected code consists of scalar loads
; (presumably fetching the three kernel arguments), one s_waitcnt lgkmcnt(0)
; (s_wait_kmcnt 0x0 in the gfx1200 blocks), register setup, and a single
; cmpswap atomic that is immediately followed by s_endpgm.
5455define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
5456; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5457; GFX6:       ; %bb.0: ; %entry
5458; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
5459; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5460; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
5461; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
5462; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5463; GFX6-NEXT:    s_mov_b32 s12, s5
5464; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
5465; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
5466; GFX6-NEXT:    s_mov_b32 s11, -1
5467; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
5468; GFX6-NEXT:    s_mov_b32 s5, s12
5469; GFX6-NEXT:    s_mov_b32 s6, s11
5470; GFX6-NEXT:    s_mov_b32 s7, s10
5471; GFX6-NEXT:    v_mov_b32_e32 v0, s9
5472; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5473; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5474; GFX6-NEXT:    v_mov_b32_e32 v1, v2
5475; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5476; GFX6-NEXT:    s_endpgm
5477;
5478; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5479; GFX7:       ; %bb.0: ; %entry
5480; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5481; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5482; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5483; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5484; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5485; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5486; GFX7-NEXT:    s_mov_b32 s4, s8
5487; GFX7-NEXT:    s_mov_b32 s5, s9
5488; GFX7-NEXT:    s_mov_b32 s9, s10
5489; GFX7-NEXT:    s_mov_b32 s8, s11
5490; GFX7-NEXT:    s_add_u32 s4, s4, s9
5491; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5492; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5493; GFX7-NEXT:    s_mov_b32 s5, s8
5494; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5495; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5496; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5497; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5498; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5499; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5500; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5501; GFX7-NEXT:    s_endpgm
5502;
5503; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5504; GFX10-WGP:       ; %bb.0: ; %entry
5505; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5506; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5507; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
5508; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
5509; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5510; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5511; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
5512; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5513; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
5514; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5515; GFX10-WGP-NEXT:    s_endpgm
5516;
5517; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5518; GFX10-CU:       ; %bb.0: ; %entry
5519; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5520; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5521; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
5522; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
5523; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5524; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5525; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
5526; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5527; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
5528; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5529; GFX10-CU-NEXT:    s_endpgm
5530;
5531; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5532; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5533; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5534; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5535; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5536; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5537; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5538; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
5539; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
5540; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
5541; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
5542; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
5543; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
5544; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
5545; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5546; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
5547; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
5548; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5549; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
5550; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5551; SKIP-CACHE-INV-NEXT:    s_endpgm
5552;
5553; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5554; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5555; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5556; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5557; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5558; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5559; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5560; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5561; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5562; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5563; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5564; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5565; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5566;
5567; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5568; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5569; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5570; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5571; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5572; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5573; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5574; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5575; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5576; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5577; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5578; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5579; GFX90A-TGSPLIT-NEXT:    s_endpgm
5580;
5581; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5582; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5583; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5584; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5585; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5586; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5587; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5588; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5589; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5590; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5591; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5592; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5593; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5594;
5595; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5596; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5597; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5598; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5599; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5600; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5601; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5602; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5603; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5604; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5605; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5606; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5607; GFX940-TGSPLIT-NEXT:    s_endpgm
5608;
5609; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5610; GFX11-WGP:       ; %bb.0: ; %entry
5611; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
5612; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5613; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5614; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5615; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5616; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
5617; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
5618; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5619; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
5620; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5621; GFX11-WGP-NEXT:    s_endpgm
5622;
5623; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5624; GFX11-CU:       ; %bb.0: ; %entry
5625; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
5626; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5627; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5628; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5629; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5630; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
5631; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
5632; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5633; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
5634; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5635; GFX11-CU-NEXT:    s_endpgm
5636;
5637; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5638; GFX12-WGP:       ; %bb.0: ; %entry
5639; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
5640; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5641; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5642; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5643; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5644; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
5645; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
5646; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5647; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
5648; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5649; GFX12-WGP-NEXT:    s_endpgm
5650;
5651; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
5652; GFX12-CU:       ; %bb.0: ; %entry
5653; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
5654; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5655; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5656; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5657; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5658; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
5659; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
5660; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5661; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
5662; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5663; GFX12-CU-NEXT:    s_endpgm
5664    ptr addrspace(1) %out, i32 %in, i32 %old) {
5665entry:
  ; i32 element index 4 is 16 bytes past %out; the checks above show this as
  ; offset:16 on the atomic, or as an explicit 64-bit add of 16 in the gfx700
  ; amdhsa block, which uses flat_atomic_cmpswap.
5666  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; cmpxchg operands are pointer, expected value (%old), new value (%in);
  ; orderings are success = acq_rel, failure = seq_cst.
5667  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
5668  ret void
5669}
5670
5671define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
5672; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5673; GFX6:       ; %bb.0: ; %entry
5674; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
5675; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5676; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
5677; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
5678; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5679; GFX6-NEXT:    s_mov_b32 s12, s5
5680; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
5681; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
5682; GFX6-NEXT:    s_mov_b32 s11, -1
5683; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
5684; GFX6-NEXT:    s_mov_b32 s5, s12
5685; GFX6-NEXT:    s_mov_b32 s6, s11
5686; GFX6-NEXT:    s_mov_b32 s7, s10
5687; GFX6-NEXT:    v_mov_b32_e32 v0, s9
5688; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5689; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5690; GFX6-NEXT:    v_mov_b32_e32 v1, v2
5691; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
5692; GFX6-NEXT:    s_endpgm
5693;
5694; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5695; GFX7:       ; %bb.0: ; %entry
5696; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5697; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5698; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
5699; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
5700; GFX7-NEXT:    s_mov_b64 s[10:11], 16
5701; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5702; GFX7-NEXT:    s_mov_b32 s4, s8
5703; GFX7-NEXT:    s_mov_b32 s5, s9
5704; GFX7-NEXT:    s_mov_b32 s9, s10
5705; GFX7-NEXT:    s_mov_b32 s8, s11
5706; GFX7-NEXT:    s_add_u32 s4, s4, s9
5707; GFX7-NEXT:    s_addc_u32 s8, s5, s8
5708; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5709; GFX7-NEXT:    s_mov_b32 s5, s8
5710; GFX7-NEXT:    v_mov_b32_e32 v2, s7
5711; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5712; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5713; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5714; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5715; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5716; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
5717; GFX7-NEXT:    s_endpgm
5718;
5719; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5720; GFX10-WGP:       ; %bb.0: ; %entry
5721; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5722; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5723; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
5724; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
5725; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5726; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5727; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
5728; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5729; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
5730; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5731; GFX10-WGP-NEXT:    s_endpgm
5732;
5733; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5734; GFX10-CU:       ; %bb.0: ; %entry
5735; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5736; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5737; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
5738; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
5739; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5740; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5741; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
5742; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5743; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
5744; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
5745; GFX10-CU-NEXT:    s_endpgm
5746;
5747; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5748; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5749; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5750; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5751; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5752; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5753; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5754; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
5755; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
5756; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
5757; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
5758; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
5759; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
5760; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
5761; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
5763; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
5764; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5765; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
5766; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
5767; SKIP-CACHE-INV-NEXT:    s_endpgm
5768;
5769; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5770; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
5771; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5772; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5773; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5774; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5775; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5776; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5777; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5778; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5779; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5780; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5781; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
5782;
5783; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5784; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
5785; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5786; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5787; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
5788; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
5789; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5790; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
5791; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
5792; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5793; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5794; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
5795; GFX90A-TGSPLIT-NEXT:    s_endpgm
5796;
5797; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5798; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
5799; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5800; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5801; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5802; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5803; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5804; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5805; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5806; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5807; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5808; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5809; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
5810;
5811; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5812; GFX940-TGSPLIT:       ; %bb.0: ; %entry
5813; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
5814; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5815; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
5816; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
5817; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
5818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
5819; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
5820; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5821; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
5822; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
5823; GFX940-TGSPLIT-NEXT:    s_endpgm
5824;
5825; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5826; GFX11-WGP:       ; %bb.0: ; %entry
5827; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
5828; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5829; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5830; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5831; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5832; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
5833; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
5834; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5835; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
5836; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5837; GFX11-WGP-NEXT:    s_endpgm
5838;
5839; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5840; GFX11-CU:       ; %bb.0: ; %entry
5841; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
5842; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5843; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5844; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5845; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
5846; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
5847; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
5848; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5849; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
5850; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5851; GFX11-CU-NEXT:    s_endpgm
5852;
5853; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5854; GFX12-WGP:       ; %bb.0: ; %entry
5855; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
5856; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5857; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
5858; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
5859; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
5860; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
5861; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
5862; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5863; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
5864; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5865; GFX12-WGP-NEXT:    s_endpgm
5866;
5867; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
5868; GFX12-CU:       ; %bb.0: ; %entry
5869; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
5870; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
5871; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
5872; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
5873; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
5874; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
5875; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
5876; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5877; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
5878; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
5879; GFX12-CU-NEXT:    s_endpgm
5880    ptr addrspace(1) %out, i32 %in, i32 %old) {
5881entry:
5882  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5883  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
5884  ret void
5885}
5886
; Value-returning cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront") and monotonic success / monotonic failure ordering.
; Field 0 of the cmpxchg result is stored back to %out, so each expected
; sequence below waits for the compare-swap result before issuing the store.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the script instead of editing them by hand.
5887define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
5888; GFX6-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
5889; GFX6:       ; %bb.0: ; %entry
5890; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
5891; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5892; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
5893; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
5894; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5895; GFX6-NEXT:    s_mov_b32 s12, s5
5896; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
5897; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
5898; GFX6-NEXT:    s_mov_b32 s11, -1
5899; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
5900; GFX6-NEXT:    s_mov_b32 s5, s12
5901; GFX6-NEXT:    s_mov_b32 s6, s11
5902; GFX6-NEXT:    s_mov_b32 s7, s10
5903; GFX6-NEXT:    v_mov_b32_e32 v0, s9
5904; GFX6-NEXT:    v_mov_b32_e32 v2, s8
5905; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5906; GFX6-NEXT:    v_mov_b32_e32 v1, v2
5907; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5908; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
5909; GFX6-NEXT:    s_waitcnt vmcnt(0)
5910; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5911; GFX6-NEXT:    s_endpgm
5912;
5913; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
5914; GFX7:       ; %bb.0: ; %entry
5915; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
5916; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
5917; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
5918; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
5919; GFX7-NEXT:    s_mov_b64 s[12:13], 16
5920; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5921; GFX7-NEXT:    s_mov_b32 s6, s4
5922; GFX7-NEXT:    s_mov_b32 s7, s5
5923; GFX7-NEXT:    s_mov_b32 s11, s12
5924; GFX7-NEXT:    s_mov_b32 s10, s13
5925; GFX7-NEXT:    s_add_u32 s6, s6, s11
5926; GFX7-NEXT:    s_addc_u32 s10, s7, s10
5927; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5928; GFX7-NEXT:    s_mov_b32 s7, s10
5929; GFX7-NEXT:    v_mov_b32_e32 v2, s9
5930; GFX7-NEXT:    v_mov_b32_e32 v0, s8
5931; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5932; GFX7-NEXT:    v_mov_b32_e32 v3, v0
5933; GFX7-NEXT:    v_mov_b32_e32 v0, s6
5934; GFX7-NEXT:    v_mov_b32_e32 v1, s7
5935; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5936; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5937; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5938; GFX7-NEXT:    s_waitcnt vmcnt(0)
5939; GFX7-NEXT:    flat_store_dword v[0:1], v2
5940; GFX7-NEXT:    s_endpgm
5941;
5942; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
5943; GFX10-WGP:       ; %bb.0: ; %entry
5944; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
5945; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5946; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
5947; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
5948; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5949; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
5950; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
5951; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5952; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
5953; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
5954; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5955; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
5956; GFX10-WGP-NEXT:    s_endpgm
5957;
5958; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
5959; GFX10-CU:       ; %bb.0: ; %entry
5960; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
5961; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
5962; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
5963; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
5964; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5965; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
5966; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
5967; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
5968; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
5969; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
5970; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5971; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
5972; GFX10-CU-NEXT:    s_endpgm
5973;
5974; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
5975; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5976; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
5977; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
5978; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
5979; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
5980; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5981; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
5982; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
5983; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
5984; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
5985; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
5986; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
5987; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
5988; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
5989; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
5990; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
5991; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
5992; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
5993; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
5994; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
5995; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5996; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5997; SKIP-CACHE-INV-NEXT:    s_endpgm
5998;
5999; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6000; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6001; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6002; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6003; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6004; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6005; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6006; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6007; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6008; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6009; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6010; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6011; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6012; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6013; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6014;
6015; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6016; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6017; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6018; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6019; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6020; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6021; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6022; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6023; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6024; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6025; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6026; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6027; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6028; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6029; GFX90A-TGSPLIT-NEXT:    s_endpgm
6030;
6031; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6032; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6033; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6034; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6035; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6036; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6037; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6038; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6039; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6040; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6041; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6042; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6043; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6044; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6045; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6046;
6047; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6048; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6049; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6050; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6051; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6052; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6053; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6054; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6055; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6056; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6057; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6058; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6059; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6060; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6061; GFX940-TGSPLIT-NEXT:    s_endpgm
6062;
6063; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6064; GFX11-WGP:       ; %bb.0: ; %entry
6065; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
6066; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6067; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6068; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6069; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6070; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
6071; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
6072; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6073; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
6074; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6075; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
6076; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6077; GFX11-WGP-NEXT:    s_endpgm
6078;
6079; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6080; GFX11-CU:       ; %bb.0: ; %entry
6081; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
6082; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6083; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6084; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6085; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6086; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
6087; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
6088; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6089; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
6090; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6091; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
6092; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6093; GFX11-CU-NEXT:    s_endpgm
6094;
6095; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6096; GFX12-WGP:       ; %bb.0: ; %entry
6097; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
6098; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6099; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6100; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6101; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6102; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
6103; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
6104; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6105; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
6106; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6107; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
6108; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6109; GFX12-WGP-NEXT:    s_endpgm
6110;
6111; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
6112; GFX12-CU:       ; %bb.0: ; %entry
6113; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
6114; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6115; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6116; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6117; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6118; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
6119; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
6120; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6121; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
6122; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6123; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
6124; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6125; GFX12-CU-NEXT:    s_endpgm
6126   ptr addrspace(1) %out, i32 %in, i32 %old) {
6127entry:
  ; i32 element 4 of %out, i.e. the byte offset 16 seen in the expected output.
6128  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6129  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
6130  %val0 = extractvalue { i32, i1 } %val, 0
6131  store i32 %val0, ptr addrspace(1) %out, align 4
6132  ret void
6133}
6134
; Value-returning cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront") and acquire success / monotonic failure ordering.
; Field 0 of the cmpxchg result is stored back to %out. In every expected
; sequence below the compare-swap is followed only by a wait for its result
; and then the store; no other instruction sits between them.
; NOTE(review): presumably no cache invalidate is required for an acquire at
; wavefront scope - confirm against the AMDGPU memory model documentation.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the script instead of editing them by hand.
6135define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
6136; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6137; GFX6:       ; %bb.0: ; %entry
6138; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
6139; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6140; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
6141; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
6142; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6143; GFX6-NEXT:    s_mov_b32 s12, s5
6144; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
6145; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
6146; GFX6-NEXT:    s_mov_b32 s11, -1
6147; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
6148; GFX6-NEXT:    s_mov_b32 s5, s12
6149; GFX6-NEXT:    s_mov_b32 s6, s11
6150; GFX6-NEXT:    s_mov_b32 s7, s10
6151; GFX6-NEXT:    v_mov_b32_e32 v0, s9
6152; GFX6-NEXT:    v_mov_b32_e32 v2, s8
6153; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6154; GFX6-NEXT:    v_mov_b32_e32 v1, v2
6155; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6156; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6157; GFX6-NEXT:    s_waitcnt vmcnt(0)
6158; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6159; GFX6-NEXT:    s_endpgm
6160;
6161; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6162; GFX7:       ; %bb.0: ; %entry
6163; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6164; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6165; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6166; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6167; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6168; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6169; GFX7-NEXT:    s_mov_b32 s6, s4
6170; GFX7-NEXT:    s_mov_b32 s7, s5
6171; GFX7-NEXT:    s_mov_b32 s11, s12
6172; GFX7-NEXT:    s_mov_b32 s10, s13
6173; GFX7-NEXT:    s_add_u32 s6, s6, s11
6174; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6175; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6176; GFX7-NEXT:    s_mov_b32 s7, s10
6177; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6178; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6179; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6180; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6181; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6182; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6183; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6184; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6185; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6186; GFX7-NEXT:    s_waitcnt vmcnt(0)
6187; GFX7-NEXT:    flat_store_dword v[0:1], v2
6188; GFX7-NEXT:    s_endpgm
6189;
6190; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6191; GFX10-WGP:       ; %bb.0: ; %entry
6192; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
6193; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6194; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
6195; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
6196; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6197; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6198; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
6199; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6200; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
6201; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6202; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6203; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
6204; GFX10-WGP-NEXT:    s_endpgm
6205;
6206; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6207; GFX10-CU:       ; %bb.0: ; %entry
6208; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
6209; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6210; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
6211; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
6212; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6213; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6214; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
6215; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6216; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
6217; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6218; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6219; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
6220; GFX10-CU-NEXT:    s_endpgm
6221;
6222; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6223; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6224; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6225; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6226; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6227; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6228; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6229; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
6230; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
6231; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
6232; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
6233; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
6234; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
6235; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
6236; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6237; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
6238; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
6239; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6240; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
6241; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6242; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6243; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6244; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6245; SKIP-CACHE-INV-NEXT:    s_endpgm
6246;
6247; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6248; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6249; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6250; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6251; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6252; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6253; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6254; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6255; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6256; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6257; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6258; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6259; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6260; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6261; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6262;
6263; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6264; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6265; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6266; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6267; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6268; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6269; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6270; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6271; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6272; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6273; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6274; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6275; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6276; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6277; GFX90A-TGSPLIT-NEXT:    s_endpgm
6278;
6279; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6280; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6281; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6282; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6283; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6284; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6285; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6286; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6287; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6288; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6289; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6290; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6291; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6292; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6293; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6294;
6295; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6296; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6297; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6298; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6299; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6300; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6301; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6302; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6303; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6304; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6305; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6306; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6307; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6308; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6309; GFX940-TGSPLIT-NEXT:    s_endpgm
6310;
6311; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6312; GFX11-WGP:       ; %bb.0: ; %entry
6313; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
6314; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6315; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6316; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6317; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6318; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
6319; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
6320; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6321; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
6322; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6323; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
6324; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6325; GFX11-WGP-NEXT:    s_endpgm
6326;
6327; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6328; GFX11-CU:       ; %bb.0: ; %entry
6329; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
6330; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6331; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6332; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6333; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6334; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
6335; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
6336; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6337; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
6338; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6339; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
6340; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6341; GFX11-CU-NEXT:    s_endpgm
6342;
6343; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6344; GFX12-WGP:       ; %bb.0: ; %entry
6345; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
6346; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6347; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6348; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6349; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6350; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
6351; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
6352; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6353; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
6354; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6355; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
6356; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6357; GFX12-WGP-NEXT:    s_endpgm
6358;
6359; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
6360; GFX12-CU:       ; %bb.0: ; %entry
6361; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
6362; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6363; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6364; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6365; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6366; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
6367; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
6368; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6369; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
6370; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6371; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
6372; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6373; GFX12-CU-NEXT:    s_endpgm
6374    ptr addrspace(1) %out, i32 %in, i32 %old) {
6375entry:
  ; i32 element 4 of %out, i.e. the byte offset 16 seen in the expected output.
6376  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6377  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
6378  %val0 = extractvalue { i32, i1 } %val, 0
6379  store i32 %val0, ptr addrspace(1) %out, align 4
6380  ret void
6381}
6382
6383define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
6384; GFX6-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6385; GFX6:       ; %bb.0: ; %entry
6386; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
6387; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6388; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
6389; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
6390; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6391; GFX6-NEXT:    s_mov_b32 s12, s5
6392; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
6393; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
6394; GFX6-NEXT:    s_mov_b32 s11, -1
6395; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
6396; GFX6-NEXT:    s_mov_b32 s5, s12
6397; GFX6-NEXT:    s_mov_b32 s6, s11
6398; GFX6-NEXT:    s_mov_b32 s7, s10
6399; GFX6-NEXT:    v_mov_b32_e32 v0, s9
6400; GFX6-NEXT:    v_mov_b32_e32 v2, s8
6401; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6402; GFX6-NEXT:    v_mov_b32_e32 v1, v2
6403; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6404; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6405; GFX6-NEXT:    s_waitcnt vmcnt(0)
6406; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6407; GFX6-NEXT:    s_endpgm
6408;
6409; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6410; GFX7:       ; %bb.0: ; %entry
6411; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6412; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6413; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6414; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6415; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6416; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6417; GFX7-NEXT:    s_mov_b32 s6, s4
6418; GFX7-NEXT:    s_mov_b32 s7, s5
6419; GFX7-NEXT:    s_mov_b32 s11, s12
6420; GFX7-NEXT:    s_mov_b32 s10, s13
6421; GFX7-NEXT:    s_add_u32 s6, s6, s11
6422; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6423; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6424; GFX7-NEXT:    s_mov_b32 s7, s10
6425; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6426; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6427; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6428; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6429; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6430; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6431; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6432; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6433; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6434; GFX7-NEXT:    s_waitcnt vmcnt(0)
6435; GFX7-NEXT:    flat_store_dword v[0:1], v2
6436; GFX7-NEXT:    s_endpgm
6437;
6438; GFX10-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6439; GFX10-WGP:       ; %bb.0: ; %entry
6440; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
6441; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6442; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
6443; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
6444; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6445; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6446; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
6447; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6448; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
6449; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6450; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6451; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
6452; GFX10-WGP-NEXT:    s_endpgm
6453;
6454; GFX10-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6455; GFX10-CU:       ; %bb.0: ; %entry
6456; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
6457; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6458; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
6459; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
6460; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6461; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6462; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
6463; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6464; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
6465; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6466; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6467; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
6468; GFX10-CU-NEXT:    s_endpgm
6469;
6470; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6471; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6472; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6473; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6474; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6475; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6476; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6477; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
6478; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
6479; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
6480; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
6481; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
6482; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
6483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
6484; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
6486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
6487; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6488; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
6489; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6490; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6491; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6492; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6493; SKIP-CACHE-INV-NEXT:    s_endpgm
6494;
6495; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6496; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6497; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6498; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6499; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6500; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6501; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6502; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6503; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6504; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6505; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6506; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6507; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6508; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6509; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6510;
6511; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6512; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6513; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6514; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6515; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6516; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6517; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6518; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6519; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6520; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6521; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6522; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6523; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6524; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6525; GFX90A-TGSPLIT-NEXT:    s_endpgm
6526;
6527; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6528; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6529; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6530; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6531; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6532; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6533; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6534; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6535; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6536; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6537; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6538; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6539; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6540; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6541; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6542;
6543; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6544; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6545; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6546; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6547; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6548; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6549; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6550; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6551; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6552; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6553; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6554; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6555; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6556; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6557; GFX940-TGSPLIT-NEXT:    s_endpgm
6558;
6559; GFX11-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6560; GFX11-WGP:       ; %bb.0: ; %entry
6561; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
6562; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6563; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6564; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6565; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6566; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
6567; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
6568; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6569; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
6570; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6571; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
6572; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6573; GFX11-WGP-NEXT:    s_endpgm
6574;
6575; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6576; GFX11-CU:       ; %bb.0: ; %entry
6577; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
6578; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6579; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6580; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6581; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6582; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
6583; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
6584; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6585; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
6586; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6587; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
6588; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6589; GFX11-CU-NEXT:    s_endpgm
6590;
6591; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6592; GFX12-WGP:       ; %bb.0: ; %entry
6593; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
6594; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6595; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6596; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6597; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6598; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
6599; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
6600; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6601; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
6602; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6603; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
6604; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6605; GFX12-WGP-NEXT:    s_endpgm
6606;
6607; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
6608; GFX12-CU:       ; %bb.0: ; %entry
6609; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
6610; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6611; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6612; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6613; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6614; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
6615; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
6616; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6617; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
6618; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6619; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
6620; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6621; GFX12-CU-NEXT:    s_endpgm
6622   ptr addrspace(1) %out, i32 %in, i32 %old) {
6623entry:
6624  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6625  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
6626  %val0 = extractvalue { i32, i1 } %val, 0
6627  store i32 %val0, ptr addrspace(1) %out, align 4
6628  ret void
6629}
6630
; Kernel under test: a volatile cmpxchg on global memory (addrspace 1) at
; syncscope("wavefront") with acq_rel success ordering and monotonic failure
; ordering. The i32 value produced by the cmpxchg is written back to %out, so
; each expected sequence below issues a compare-swap that writes a result
; register (glc / sc0 / th:TH_ATOMIC_RETURN in the checks), waits for that
; result, and then stores it. None of the expected sequences contain a cache
; invalidate or write-back instruction.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6631define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
6632; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6633; GFX6:       ; %bb.0: ; %entry
6634; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
6635; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6636; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
6637; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
6638; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6639; GFX6-NEXT:    s_mov_b32 s12, s5
6640; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
6641; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
6642; GFX6-NEXT:    s_mov_b32 s11, -1
6643; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
6644; GFX6-NEXT:    s_mov_b32 s5, s12
6645; GFX6-NEXT:    s_mov_b32 s6, s11
6646; GFX6-NEXT:    s_mov_b32 s7, s10
6647; GFX6-NEXT:    v_mov_b32_e32 v0, s9
6648; GFX6-NEXT:    v_mov_b32_e32 v2, s8
6649; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6650; GFX6-NEXT:    v_mov_b32_e32 v1, v2
6651; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6652; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6653; GFX6-NEXT:    s_waitcnt vmcnt(0)
6654; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6655; GFX6-NEXT:    s_endpgm
6656;
6657; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6658; GFX7:       ; %bb.0: ; %entry
6659; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6660; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6661; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6662; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6663; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6664; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6665; GFX7-NEXT:    s_mov_b32 s6, s4
6666; GFX7-NEXT:    s_mov_b32 s7, s5
6667; GFX7-NEXT:    s_mov_b32 s11, s12
6668; GFX7-NEXT:    s_mov_b32 s10, s13
6669; GFX7-NEXT:    s_add_u32 s6, s6, s11
6670; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6671; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6672; GFX7-NEXT:    s_mov_b32 s7, s10
6673; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6674; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6675; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6676; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6677; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6678; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6679; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6680; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6681; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6682; GFX7-NEXT:    s_waitcnt vmcnt(0)
6683; GFX7-NEXT:    flat_store_dword v[0:1], v2
6684; GFX7-NEXT:    s_endpgm
6685;
6686; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6687; GFX10-WGP:       ; %bb.0: ; %entry
6688; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
6689; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6690; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
6691; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
6692; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6693; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6694; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
6695; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6696; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
6697; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6698; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6699; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
6700; GFX10-WGP-NEXT:    s_endpgm
6701;
6702; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6703; GFX10-CU:       ; %bb.0: ; %entry
6704; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
6705; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6706; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
6707; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
6708; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6709; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6710; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
6711; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6712; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
6713; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6714; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6715; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
6716; GFX10-CU-NEXT:    s_endpgm
6717;
6718; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6719; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6720; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6721; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6722; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6723; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6724; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6725; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
6726; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
6727; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
6728; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
6729; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
6730; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
6731; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
6732; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6733; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
6734; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
6735; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6736; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
6737; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6738; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6739; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6740; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6741; SKIP-CACHE-INV-NEXT:    s_endpgm
6742;
6743; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6744; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6745; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6746; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6747; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6748; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6749; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6750; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6751; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6752; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6753; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6754; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6755; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6756; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6757; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
6758;
6759; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6760; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
6761; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6762; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6763; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6764; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6765; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6766; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6767; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
6768; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6769; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6770; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
6771; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6772; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
6773; GFX90A-TGSPLIT-NEXT:    s_endpgm
6774;
6775; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6776; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
6777; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6778; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6779; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6780; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6781; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6782; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6783; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6784; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6785; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6786; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6787; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6788; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6789; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
6790;
6791; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6792; GFX940-TGSPLIT:       ; %bb.0: ; %entry
6793; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6794; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6795; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
6796; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
6797; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6798; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
6799; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
6800; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6801; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
6802; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
6803; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
6804; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
6805; GFX940-TGSPLIT-NEXT:    s_endpgm
6806;
6807; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6808; GFX11-WGP:       ; %bb.0: ; %entry
6809; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
6810; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6811; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6812; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6813; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6814; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
6815; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
6816; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6817; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
6818; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6819; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
6820; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6821; GFX11-WGP-NEXT:    s_endpgm
6822;
6823; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6824; GFX11-CU:       ; %bb.0: ; %entry
6825; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
6826; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6827; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6828; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6829; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
6830; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
6831; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
6832; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6833; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
6834; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
6835; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
6836; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6837; GFX11-CU-NEXT:    s_endpgm
6838;
6839; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6840; GFX12-WGP:       ; %bb.0: ; %entry
6841; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
6842; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6843; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
6844; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
6845; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
6846; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
6847; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
6848; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6849; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
6850; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6851; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
6852; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
6853; GFX12-WGP-NEXT:    s_endpgm
6854;
6855; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
6856; GFX12-CU:       ; %bb.0: ; %entry
6857; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
6858; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
6859; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
6860; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
6861; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
6862; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
6863; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
6864; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6865; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
6866; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
6867; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
6868; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
6869; GFX12-CU-NEXT:    s_endpgm
6870    ptr addrspace(1) %out, i32 %in, i32 %old) {
6871entry:
  ; Address of the i32 element at index 4 of %out, i.e. byte offset 16. This
  ; is the "offset:16" operand in most expected sequences; the GFX7 sequence
  ; instead adds 16 to the pointer with s_add_u32/s_addc_u32.
6872  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the memory value against %old and, on a match, replace it with %in.
6873  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the i32 value; the i1 field is unused.
6874  %val0 = extractvalue { i32, i1 } %val, 0
  ; Write the extracted value to the start of %out.
6875  store i32 %val0, ptr addrspace(1) %out, align 4
6876  ret void
6877}
6878
6879define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
6880; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
6881; GFX6:       ; %bb.0: ; %entry
6882; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
6883; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6884; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
6885; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
6886; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6887; GFX6-NEXT:    s_mov_b32 s12, s5
6888; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
6889; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
6890; GFX6-NEXT:    s_mov_b32 s11, -1
6891; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
6892; GFX6-NEXT:    s_mov_b32 s5, s12
6893; GFX6-NEXT:    s_mov_b32 s6, s11
6894; GFX6-NEXT:    s_mov_b32 s7, s10
6895; GFX6-NEXT:    v_mov_b32_e32 v0, s9
6896; GFX6-NEXT:    v_mov_b32_e32 v2, s8
6897; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6898; GFX6-NEXT:    v_mov_b32_e32 v1, v2
6899; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6900; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6901; GFX6-NEXT:    s_waitcnt vmcnt(0)
6902; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6903; GFX6-NEXT:    s_endpgm
6904;
6905; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
6906; GFX7:       ; %bb.0: ; %entry
6907; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
6908; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
6909; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
6910; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
6911; GFX7-NEXT:    s_mov_b64 s[12:13], 16
6912; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6913; GFX7-NEXT:    s_mov_b32 s6, s4
6914; GFX7-NEXT:    s_mov_b32 s7, s5
6915; GFX7-NEXT:    s_mov_b32 s11, s12
6916; GFX7-NEXT:    s_mov_b32 s10, s13
6917; GFX7-NEXT:    s_add_u32 s6, s6, s11
6918; GFX7-NEXT:    s_addc_u32 s10, s7, s10
6919; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6920; GFX7-NEXT:    s_mov_b32 s7, s10
6921; GFX7-NEXT:    v_mov_b32_e32 v2, s9
6922; GFX7-NEXT:    v_mov_b32_e32 v0, s8
6923; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6924; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6925; GFX7-NEXT:    v_mov_b32_e32 v0, s6
6926; GFX7-NEXT:    v_mov_b32_e32 v1, s7
6927; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6928; GFX7-NEXT:    v_mov_b32_e32 v0, s4
6929; GFX7-NEXT:    v_mov_b32_e32 v1, s5
6930; GFX7-NEXT:    s_waitcnt vmcnt(0)
6931; GFX7-NEXT:    flat_store_dword v[0:1], v2
6932; GFX7-NEXT:    s_endpgm
6933;
6934; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
6935; GFX10-WGP:       ; %bb.0: ; %entry
6936; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
6937; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6938; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
6939; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
6940; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
6941; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
6942; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
6943; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6944; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
6945; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6946; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
6947; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
6948; GFX10-WGP-NEXT:    s_endpgm
6949;
6950; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
6951; GFX10-CU:       ; %bb.0: ; %entry
6952; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
6953; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6954; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
6955; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
6956; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
6957; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
6958; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
6959; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
6960; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
6961; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
6962; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
6963; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
6964; GFX10-CU-NEXT:    s_endpgm
6965;
6966; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
6967; SKIP-CACHE-INV:       ; %bb.0: ; %entry
6968; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
6969; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
6970; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
6971; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
6972; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
6973; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
6974; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
6975; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
6976; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
6977; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
6978; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
6979; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
6980; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
6981; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
6982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
6983; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
6984; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
6985; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
6986; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
6987; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
6988; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6989; SKIP-CACHE-INV-NEXT:    s_endpgm
6990;
6991; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
6992; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
6993; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
6994; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
6995; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
6996; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
6997; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
6998; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
6999; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7000; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7001; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7002; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7003; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7004; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7005; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7006;
7007; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7008; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7009; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7010; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7011; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7012; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7013; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7014; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7015; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7016; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7017; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7018; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7019; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7020; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7021; GFX90A-TGSPLIT-NEXT:    s_endpgm
7022;
7023; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7024; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7025; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7026; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7027; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7028; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7029; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7030; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7031; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7032; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7033; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7034; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7035; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7036; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7037; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7038;
7039; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7040; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7041; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7042; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7043; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7044; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7045; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7046; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7047; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7048; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7049; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7050; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7051; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7052; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7053; GFX940-TGSPLIT-NEXT:    s_endpgm
7054;
7055; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7056; GFX11-WGP:       ; %bb.0: ; %entry
7057; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
7058; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7059; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7060; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7061; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7062; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
7063; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
7064; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7065; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
7066; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7067; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
7068; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7069; GFX11-WGP-NEXT:    s_endpgm
7070;
7071; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7072; GFX11-CU:       ; %bb.0: ; %entry
7073; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
7074; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7075; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7076; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7077; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7078; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
7079; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
7080; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7081; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
7082; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7083; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7084; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7085; GFX11-CU-NEXT:    s_endpgm
7086;
7087; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7088; GFX12-WGP:       ; %bb.0: ; %entry
7089; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
7090; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7091; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7092; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7093; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7094; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
7095; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
7096; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7097; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
7098; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7099; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
7100; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7101; GFX12-WGP-NEXT:    s_endpgm
7102;
7103; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
7104; GFX12-CU:       ; %bb.0: ; %entry
7105; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
7106; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7107; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7108; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7109; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7110; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
7111; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
7112; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7113; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
7114; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7115; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7116; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7117; GFX12-CU-NEXT:    s_endpgm
7118    ptr addrspace(1) %out, i32 %in, i32 %old) {
7119entry:
7120  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7121  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
7122  %val0 = extractvalue { i32, i1 } %val, 0
7123  store i32 %val0, ptr addrspace(1) %out, align 4
7124  ret void
7125}
7126
; Test kernel: a returning, volatile cmpxchg on global memory (addrspace 1) at
; syncscope("wavefront") with success ordering monotonic and failure ordering
; acquire. The value loaded by the cmpxchg is then stored to %out.
; In every expected sequence below, the atomic is issued in its returning form
; and is followed by a wait on the vector-memory counter (s_waitcnt vmcnt(0),
; or s_wait_loadcnt 0x0 on the gfx1200 runs) before the store. No
; cache-invalidate instruction appears in any of the expected sequences.
; NOTE(review): presumably that is because wavefront scope needs no cache
; maintenance - confirm against the AMDGPU memory model documentation.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
7127define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
7128; GFX6-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7129; GFX6:       ; %bb.0: ; %entry
7130; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
7131; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7132; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
7133; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
7134; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7135; GFX6-NEXT:    s_mov_b32 s12, s5
7136; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
7137; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
7138; GFX6-NEXT:    s_mov_b32 s11, -1
7139; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
7140; GFX6-NEXT:    s_mov_b32 s5, s12
7141; GFX6-NEXT:    s_mov_b32 s6, s11
7142; GFX6-NEXT:    s_mov_b32 s7, s10
7143; GFX6-NEXT:    v_mov_b32_e32 v0, s9
7144; GFX6-NEXT:    v_mov_b32_e32 v2, s8
7145; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7146; GFX6-NEXT:    v_mov_b32_e32 v1, v2
7147; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7148; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7149; GFX6-NEXT:    s_waitcnt vmcnt(0)
7150; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7151; GFX6-NEXT:    s_endpgm
7152;
7153; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7154; GFX7:       ; %bb.0: ; %entry
7155; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7156; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7157; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7158; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7159; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7160; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7161; GFX7-NEXT:    s_mov_b32 s6, s4
7162; GFX7-NEXT:    s_mov_b32 s7, s5
7163; GFX7-NEXT:    s_mov_b32 s11, s12
7164; GFX7-NEXT:    s_mov_b32 s10, s13
7165; GFX7-NEXT:    s_add_u32 s6, s6, s11
7166; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7167; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7168; GFX7-NEXT:    s_mov_b32 s7, s10
7169; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7170; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7171; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7172; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7173; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7174; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7175; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7176; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7177; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7178; GFX7-NEXT:    s_waitcnt vmcnt(0)
7179; GFX7-NEXT:    flat_store_dword v[0:1], v2
7180; GFX7-NEXT:    s_endpgm
7181;
7182; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7183; GFX10-WGP:       ; %bb.0: ; %entry
7184; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
7185; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7186; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
7187; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
7188; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7189; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7190; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
7191; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7192; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
7193; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7194; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7195; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
7196; GFX10-WGP-NEXT:    s_endpgm
7197;
7198; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7199; GFX10-CU:       ; %bb.0: ; %entry
7200; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
7201; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7202; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
7203; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
7204; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7205; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7206; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
7207; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7208; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
7209; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7210; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7211; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
7212; GFX10-CU-NEXT:    s_endpgm
7213;
7214; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7215; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7216; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7217; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7218; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7219; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7220; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7221; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
7222; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
7223; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
7224; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
7225; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
7226; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
7227; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
7228; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7229; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
7230; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
7231; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7232; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
7233; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7234; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7235; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7236; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7237; SKIP-CACHE-INV-NEXT:    s_endpgm
7238;
7239; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7240; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7241; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7242; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7243; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7244; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7245; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7246; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7247; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7248; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7249; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7250; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7251; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7252; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7253; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7254;
7255; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7256; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7257; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7258; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7259; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7260; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7261; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7262; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7263; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7264; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7265; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7266; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7267; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7268; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7269; GFX90A-TGSPLIT-NEXT:    s_endpgm
7270;
7271; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7272; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7273; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7274; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7275; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7276; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7277; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7278; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7279; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7280; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7281; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7282; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7283; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7284; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7285; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7286;
7287; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7288; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7289; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7290; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7291; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7292; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7293; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7294; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7295; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7296; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7297; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7298; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7299; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7300; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7301; GFX940-TGSPLIT-NEXT:    s_endpgm
7302;
7303; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7304; GFX11-WGP:       ; %bb.0: ; %entry
7305; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
7306; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7307; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7308; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7309; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7310; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
7311; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
7312; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7313; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
7314; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7315; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
7316; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7317; GFX11-WGP-NEXT:    s_endpgm
7318;
7319; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7320; GFX11-CU:       ; %bb.0: ; %entry
7321; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
7322; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7323; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7324; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7325; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7326; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
7327; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
7328; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7329; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
7330; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7331; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7332; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7333; GFX11-CU-NEXT:    s_endpgm
7334;
7335; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7336; GFX12-WGP:       ; %bb.0: ; %entry
7337; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
7338; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7339; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7340; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7341; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7342; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
7343; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
7344; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7345; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
7346; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7347; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
7348; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7349; GFX12-WGP-NEXT:    s_endpgm
7350;
7351; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
7352; GFX12-CU:       ; %bb.0: ; %entry
7353; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
7354; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7355; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7356; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7357; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7358; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
7359; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
7360; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7361; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
7362; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7363; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7364; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7365; GFX12-CU-NEXT:    s_endpgm
7366    ptr addrspace(1) %out, i32 %in, i32 %old) {
7367entry:
  ; Address of i32 element 4 past %out, i.e. byte offset 16. This shows up as
  ; "offset:16" on the atomic (or as an explicit 64-bit add of 16 on the gfx700
  ; hsa run) in the expected output above.
7368  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Operation under test: compare against %old, swap in %in.
7369  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the i1
  ; success flag is not used by this test.
7370  %val0 = extractvalue { i32, i1 } %val, 0
  ; Non-atomic store of the loaded value to the base pointer %out.
7371  store i32 %val0, ptr addrspace(1) %out, align 4
7372  ret void
7373}
7374
; Test kernel: a returning, volatile cmpxchg on global memory (addrspace 1) at
; syncscope("wavefront") with acquire as both the success and the failure
; ordering. The value loaded by the cmpxchg is then stored to %out.
; In every expected sequence below, the atomic is issued in its returning form
; and is followed by a wait on the vector-memory counter (s_waitcnt vmcnt(0),
; or s_wait_loadcnt 0x0 on the gfx1200 runs) before the store. Despite the
; acquire ordering, no cache-invalidate instruction appears in any of the
; expected sequences.
; NOTE(review): presumably that is because wavefront scope needs no cache
; maintenance - confirm against the AMDGPU memory model documentation.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
7375define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
7376; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7377; GFX6:       ; %bb.0: ; %entry
7378; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
7379; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7380; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
7381; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
7382; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7383; GFX6-NEXT:    s_mov_b32 s12, s5
7384; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
7385; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
7386; GFX6-NEXT:    s_mov_b32 s11, -1
7387; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
7388; GFX6-NEXT:    s_mov_b32 s5, s12
7389; GFX6-NEXT:    s_mov_b32 s6, s11
7390; GFX6-NEXT:    s_mov_b32 s7, s10
7391; GFX6-NEXT:    v_mov_b32_e32 v0, s9
7392; GFX6-NEXT:    v_mov_b32_e32 v2, s8
7393; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7394; GFX6-NEXT:    v_mov_b32_e32 v1, v2
7395; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7396; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7397; GFX6-NEXT:    s_waitcnt vmcnt(0)
7398; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7399; GFX6-NEXT:    s_endpgm
7400;
7401; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7402; GFX7:       ; %bb.0: ; %entry
7403; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7404; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7405; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7406; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7407; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7408; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7409; GFX7-NEXT:    s_mov_b32 s6, s4
7410; GFX7-NEXT:    s_mov_b32 s7, s5
7411; GFX7-NEXT:    s_mov_b32 s11, s12
7412; GFX7-NEXT:    s_mov_b32 s10, s13
7413; GFX7-NEXT:    s_add_u32 s6, s6, s11
7414; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7415; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7416; GFX7-NEXT:    s_mov_b32 s7, s10
7417; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7418; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7419; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7420; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7421; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7422; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7423; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7424; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7425; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7426; GFX7-NEXT:    s_waitcnt vmcnt(0)
7427; GFX7-NEXT:    flat_store_dword v[0:1], v2
7428; GFX7-NEXT:    s_endpgm
7429;
7430; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7431; GFX10-WGP:       ; %bb.0: ; %entry
7432; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
7433; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7434; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
7435; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
7436; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7437; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7438; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
7439; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7440; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
7441; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7442; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7443; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
7444; GFX10-WGP-NEXT:    s_endpgm
7445;
7446; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7447; GFX10-CU:       ; %bb.0: ; %entry
7448; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
7449; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7450; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
7451; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
7452; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7453; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7454; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
7455; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7456; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
7457; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7458; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7459; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
7460; GFX10-CU-NEXT:    s_endpgm
7461;
7462; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7463; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7464; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7465; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7466; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7467; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7468; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7469; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
7470; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
7471; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
7472; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
7473; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
7474; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
7475; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
7476; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7477; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
7478; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
7479; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7480; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
7481; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7482; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7483; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7484; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7485; SKIP-CACHE-INV-NEXT:    s_endpgm
7486;
7487; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7488; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7489; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7490; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7491; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7492; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7493; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7494; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7495; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7496; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7497; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7498; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7499; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7500; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7501; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7502;
7503; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7504; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7505; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7506; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7507; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7508; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7509; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7510; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7511; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7512; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7513; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7514; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7515; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7516; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7517; GFX90A-TGSPLIT-NEXT:    s_endpgm
7518;
7519; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7520; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7521; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7522; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7523; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7524; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7525; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7526; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7527; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7528; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7529; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7530; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7531; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7532; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7533; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7534;
7535; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7536; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7537; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7538; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7539; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7540; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7541; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7542; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7543; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7544; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7545; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7546; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7547; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7548; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7549; GFX940-TGSPLIT-NEXT:    s_endpgm
7550;
7551; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7552; GFX11-WGP:       ; %bb.0: ; %entry
7553; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
7554; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7555; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7556; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7557; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7558; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
7559; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
7560; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7561; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
7562; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7563; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
7564; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7565; GFX11-WGP-NEXT:    s_endpgm
7566;
7567; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7568; GFX11-CU:       ; %bb.0: ; %entry
7569; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
7570; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7571; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7572; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7573; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7574; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
7575; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
7576; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7577; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
7578; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7579; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7580; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7581; GFX11-CU-NEXT:    s_endpgm
7582;
7583; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7584; GFX12-WGP:       ; %bb.0: ; %entry
7585; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
7586; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7587; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7588; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7589; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7590; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
7591; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
7592; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7593; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
7594; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7595; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
7596; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7597; GFX12-WGP-NEXT:    s_endpgm
7598;
7599; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
7600; GFX12-CU:       ; %bb.0: ; %entry
7601; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
7602; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7603; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7604; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7605; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7606; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
7607; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
7608; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7609; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
7610; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7611; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7612; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7613; GFX12-CU-NEXT:    s_endpgm
7614    ptr addrspace(1) %out, i32 %in, i32 %old) {
7615entry:
  ; Address of i32 element 4 past %out, i.e. byte offset 16. This shows up as
  ; "offset:16" on the atomic (or as an explicit 64-bit add of 16 on the gfx700
  ; hsa run) in the expected output above.
7616  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Operation under test: compare against %old, swap in %in.
7617  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
  ; Field 0 of the { i32, i1 } result is the value loaded from memory; the i1
  ; success flag is not used by this test.
7618  %val0 = extractvalue { i32, i1 } %val, 0
  ; Non-atomic store of the loaded value to the base pointer %out.
7619  store i32 %val0, ptr addrspace(1) %out, align 4
7620  ret void
7621}
7622
7623define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
7624; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7625; GFX6:       ; %bb.0: ; %entry
7626; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
7627; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7628; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
7629; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
7630; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7631; GFX6-NEXT:    s_mov_b32 s12, s5
7632; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
7633; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
7634; GFX6-NEXT:    s_mov_b32 s11, -1
7635; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
7636; GFX6-NEXT:    s_mov_b32 s5, s12
7637; GFX6-NEXT:    s_mov_b32 s6, s11
7638; GFX6-NEXT:    s_mov_b32 s7, s10
7639; GFX6-NEXT:    v_mov_b32_e32 v0, s9
7640; GFX6-NEXT:    v_mov_b32_e32 v2, s8
7641; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7642; GFX6-NEXT:    v_mov_b32_e32 v1, v2
7643; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7644; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7645; GFX6-NEXT:    s_waitcnt vmcnt(0)
7646; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7647; GFX6-NEXT:    s_endpgm
7648;
7649; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7650; GFX7:       ; %bb.0: ; %entry
7651; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7652; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7653; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7654; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7655; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7656; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7657; GFX7-NEXT:    s_mov_b32 s6, s4
7658; GFX7-NEXT:    s_mov_b32 s7, s5
7659; GFX7-NEXT:    s_mov_b32 s11, s12
7660; GFX7-NEXT:    s_mov_b32 s10, s13
7661; GFX7-NEXT:    s_add_u32 s6, s6, s11
7662; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7663; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7664; GFX7-NEXT:    s_mov_b32 s7, s10
7665; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7666; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7667; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7668; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7669; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7670; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7671; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7672; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7673; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7674; GFX7-NEXT:    s_waitcnt vmcnt(0)
7675; GFX7-NEXT:    flat_store_dword v[0:1], v2
7676; GFX7-NEXT:    s_endpgm
7677;
7678; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7679; GFX10-WGP:       ; %bb.0: ; %entry
7680; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
7681; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7682; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
7683; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
7684; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7685; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7686; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
7687; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7688; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
7689; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7690; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7691; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
7692; GFX10-WGP-NEXT:    s_endpgm
7693;
7694; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7695; GFX10-CU:       ; %bb.0: ; %entry
7696; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
7697; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7698; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
7699; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
7700; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7701; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7702; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
7703; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7704; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
7705; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7706; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7707; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
7708; GFX10-CU-NEXT:    s_endpgm
7709;
7710; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7711; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7712; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7713; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7714; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7715; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7716; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7717; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
7718; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
7719; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
7720; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
7721; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
7722; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
7723; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
7724; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7725; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
7726; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
7727; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7728; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
7729; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7730; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7731; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7732; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7733; SKIP-CACHE-INV-NEXT:    s_endpgm
7734;
7735; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7736; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7737; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7738; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7739; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7740; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7741; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7742; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7743; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7744; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7745; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7746; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7747; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7748; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7749; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7750;
7751; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7752; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
7753; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7754; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7755; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7756; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7757; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7758; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7759; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7760; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7761; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7762; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7763; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7764; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7765; GFX90A-TGSPLIT-NEXT:    s_endpgm
7766;
7767; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7768; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
7769; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7770; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7771; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7772; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7773; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7774; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7775; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7776; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7777; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7778; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7779; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7780; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7781; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
7782;
7783; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7784; GFX940-TGSPLIT:       ; %bb.0: ; %entry
7785; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7786; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
7787; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
7788; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
7789; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7790; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
7791; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
7792; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7793; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7794; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
7795; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7796; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
7797; GFX940-TGSPLIT-NEXT:    s_endpgm
7798;
7799; GFX11-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7800; GFX11-WGP:       ; %bb.0: ; %entry
7801; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
7802; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7803; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7804; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7805; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7806; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
7807; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
7808; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7809; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
7810; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7811; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
7812; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7813; GFX11-WGP-NEXT:    s_endpgm
7814;
7815; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7816; GFX11-CU:       ; %bb.0: ; %entry
7817; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
7818; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7819; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7820; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7821; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
7822; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
7823; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
7824; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7825; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
7826; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
7827; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
7828; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7829; GFX11-CU-NEXT:    s_endpgm
7830;
7831; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7832; GFX12-WGP:       ; %bb.0: ; %entry
7833; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
7834; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7835; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
7836; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
7837; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
7838; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
7839; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
7840; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7841; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
7842; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7843; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
7844; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
7845; GFX12-WGP-NEXT:    s_endpgm
7846;
7847; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
7848; GFX12-CU:       ; %bb.0: ; %entry
7849; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
7850; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
7851; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
7852; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
7853; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
7854; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
7855; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
7856; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7857; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
7858; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
7859; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
7860; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
7861; GFX12-CU-NEXT:    s_endpgm
7862    ptr addrspace(1) %out, i32 %in, i32 %old) {
7863entry:
7864  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7865  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
7866  %val0 = extractvalue { i32, i1 } %val, 0
7867  store i32 %val0, ptr addrspace(1) %out, align 4
7868  ret void
7869}
7870
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer at
; syncscope("wavefront"), with success ordering acq_rel and failure ordering
; acquire. The value loaded by the cmpxchg is stored back through %out, so the
; returning form of the atomic is what gets checked.
; In every expected sequence below the atomic is followed by a wait on its
; result (s_waitcnt vmcnt(0), or s_wait_loadcnt 0x0 on the gfx12 runs) and then
; the dependent store; no cache invalidate or write-back instruction appears.
; The assertion lines are machine-generated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
7871define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
7872; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
7873; GFX6:       ; %bb.0: ; %entry
7874; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
7875; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7876; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
7877; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
7878; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7879; GFX6-NEXT:    s_mov_b32 s12, s5
7880; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
7881; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
7882; GFX6-NEXT:    s_mov_b32 s11, -1
7883; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
7884; GFX6-NEXT:    s_mov_b32 s5, s12
7885; GFX6-NEXT:    s_mov_b32 s6, s11
7886; GFX6-NEXT:    s_mov_b32 s7, s10
7887; GFX6-NEXT:    v_mov_b32_e32 v0, s9
7888; GFX6-NEXT:    v_mov_b32_e32 v2, s8
7889; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7890; GFX6-NEXT:    v_mov_b32_e32 v1, v2
7891; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7892; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7893; GFX6-NEXT:    s_waitcnt vmcnt(0)
7894; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7895; GFX6-NEXT:    s_endpgm
7896;
7897; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
7898; GFX7:       ; %bb.0: ; %entry
7899; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
7900; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
7901; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
7902; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
7903; GFX7-NEXT:    s_mov_b64 s[12:13], 16
7904; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7905; GFX7-NEXT:    s_mov_b32 s6, s4
7906; GFX7-NEXT:    s_mov_b32 s7, s5
7907; GFX7-NEXT:    s_mov_b32 s11, s12
7908; GFX7-NEXT:    s_mov_b32 s10, s13
7909; GFX7-NEXT:    s_add_u32 s6, s6, s11
7910; GFX7-NEXT:    s_addc_u32 s10, s7, s10
7911; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7912; GFX7-NEXT:    s_mov_b32 s7, s10
7913; GFX7-NEXT:    v_mov_b32_e32 v2, s9
7914; GFX7-NEXT:    v_mov_b32_e32 v0, s8
7915; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7916; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7917; GFX7-NEXT:    v_mov_b32_e32 v0, s6
7918; GFX7-NEXT:    v_mov_b32_e32 v1, s7
7919; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7920; GFX7-NEXT:    v_mov_b32_e32 v0, s4
7921; GFX7-NEXT:    v_mov_b32_e32 v1, s5
7922; GFX7-NEXT:    s_waitcnt vmcnt(0)
7923; GFX7-NEXT:    flat_store_dword v[0:1], v2
7924; GFX7-NEXT:    s_endpgm
7925;
7926; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
7927; GFX10-WGP:       ; %bb.0: ; %entry
7928; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
7929; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7930; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
7931; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
7932; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
7933; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
7934; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
7935; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7936; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
7937; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7938; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
7939; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
7940; GFX10-WGP-NEXT:    s_endpgm
7941;
7942; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
7943; GFX10-CU:       ; %bb.0: ; %entry
7944; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
7945; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7946; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
7947; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
7948; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
7949; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
7950; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
7951; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
7952; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
7953; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
7954; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
7955; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
7956; GFX10-CU-NEXT:    s_endpgm
7957;
7958; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
7959; SKIP-CACHE-INV:       ; %bb.0: ; %entry
7960; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
7961; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
7962; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
7963; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
7964; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
7965; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
7966; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
7967; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
7968; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
7969; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
7970; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
7971; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
7972; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
7973; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
7974; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
7975; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
7976; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
7977; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
7978; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
7979; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
7980; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7981; SKIP-CACHE-INV-NEXT:    s_endpgm
7982;
7983; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
7984; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
7985; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
7986; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
7987; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
7988; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
7989; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
7990; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
7991; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
7992; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7993; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
7994; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
7995; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
7996; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
7997; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
7998;
7999; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8000; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8001; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8002; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8003; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8004; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8005; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8006; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8007; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8008; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8009; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8010; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8011; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8012; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8013; GFX90A-TGSPLIT-NEXT:    s_endpgm
8014;
8015; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8016; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8017; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8018; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8019; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8020; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8021; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8022; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8023; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8024; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8025; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8026; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8027; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8028; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8029; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8030;
8031; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8032; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8033; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8034; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8035; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8036; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8037; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8038; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8039; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8040; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8041; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8042; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8043; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8044; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8045; GFX940-TGSPLIT-NEXT:    s_endpgm
8046;
8047; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8048; GFX11-WGP:       ; %bb.0: ; %entry
8049; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
8050; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8051; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8052; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8053; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8054; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
8055; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
8056; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8057; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
8058; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8059; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
8060; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8061; GFX11-WGP-NEXT:    s_endpgm
8062;
8063; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8064; GFX11-CU:       ; %bb.0: ; %entry
8065; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
8066; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8067; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8068; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8069; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8070; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
8071; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
8072; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8073; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
8074; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8075; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8076; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8077; GFX11-CU-NEXT:    s_endpgm
8078;
8079; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8080; GFX12-WGP:       ; %bb.0: ; %entry
8081; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
8082; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8083; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8084; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8085; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8086; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
8087; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
8088; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8089; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
8090; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8091; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
8092; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8093; GFX12-WGP-NEXT:    s_endpgm
8094;
8095; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
8096; GFX12-CU:       ; %bb.0: ; %entry
8097; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
8098; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8099; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8100; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8101; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8102; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
8103; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
8104; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8105; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
8106; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8107; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8108; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8109; GFX12-CU-NEXT:    s_endpgm
8110    ptr addrspace(1) %out, i32 %in, i32 %old) {
8111entry:
  ; Address of the i32 element at index 4 of %out, i.e. 16 bytes past %out;
  ; this is the 16-byte offset that shows up in the expected atomics above.
8112  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8113  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory;
  ; storing it to %out keeps the atomic's result live.
8114  %val0 = extractvalue { i32, i1 } %val, 0
8115  store i32 %val0, ptr addrspace(1) %out, align 4
8116  ret void
8117}
8118
8119define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
8120; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8121; GFX6:       ; %bb.0: ; %entry
8122; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
8123; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8124; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
8125; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
8126; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8127; GFX6-NEXT:    s_mov_b32 s12, s5
8128; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
8129; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
8130; GFX6-NEXT:    s_mov_b32 s11, -1
8131; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
8132; GFX6-NEXT:    s_mov_b32 s5, s12
8133; GFX6-NEXT:    s_mov_b32 s6, s11
8134; GFX6-NEXT:    s_mov_b32 s7, s10
8135; GFX6-NEXT:    v_mov_b32_e32 v0, s9
8136; GFX6-NEXT:    v_mov_b32_e32 v2, s8
8137; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8138; GFX6-NEXT:    v_mov_b32_e32 v1, v2
8139; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
8140; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8141; GFX6-NEXT:    s_waitcnt vmcnt(0)
8142; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8143; GFX6-NEXT:    s_endpgm
8144;
8145; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8146; GFX7:       ; %bb.0: ; %entry
8147; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8148; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8149; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8150; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8151; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8152; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8153; GFX7-NEXT:    s_mov_b32 s6, s4
8154; GFX7-NEXT:    s_mov_b32 s7, s5
8155; GFX7-NEXT:    s_mov_b32 s11, s12
8156; GFX7-NEXT:    s_mov_b32 s10, s13
8157; GFX7-NEXT:    s_add_u32 s6, s6, s11
8158; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8159; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8160; GFX7-NEXT:    s_mov_b32 s7, s10
8161; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8162; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8163; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8164; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8165; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8166; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8167; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8168; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8169; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8170; GFX7-NEXT:    s_waitcnt vmcnt(0)
8171; GFX7-NEXT:    flat_store_dword v[0:1], v2
8172; GFX7-NEXT:    s_endpgm
8173;
8174; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8175; GFX10-WGP:       ; %bb.0: ; %entry
8176; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
8177; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8178; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
8179; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
8180; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8181; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8182; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
8183; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8184; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
8185; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8186; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8187; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
8188; GFX10-WGP-NEXT:    s_endpgm
8189;
8190; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8191; GFX10-CU:       ; %bb.0: ; %entry
8192; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
8193; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8194; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
8195; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
8196; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8197; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8198; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
8199; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8200; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
8201; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8202; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8203; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
8204; GFX10-CU-NEXT:    s_endpgm
8205;
8206; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8207; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8208; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8209; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8210; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8211; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8212; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8213; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
8214; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
8215; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
8216; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
8217; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
8218; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
8219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
8220; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8221; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
8222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
8223; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8224; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
8225; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
8226; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8227; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8228; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8229; SKIP-CACHE-INV-NEXT:    s_endpgm
8230;
8231; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8232; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8233; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8234; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8235; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8236; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8237; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8238; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8239; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8240; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8241; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8242; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8243; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8244; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8245; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8246;
8247; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8248; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8249; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8250; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8251; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8252; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8253; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8254; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8255; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8256; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8257; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8258; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8259; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8260; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8261; GFX90A-TGSPLIT-NEXT:    s_endpgm
8262;
8263; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8264; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8265; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8266; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8267; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8268; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8269; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8270; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8271; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8272; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8273; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8274; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8275; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8276; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8277; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8278;
8279; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8280; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8281; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8282; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8283; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8284; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8285; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8286; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8287; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8288; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8289; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8290; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8291; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8292; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8293; GFX940-TGSPLIT-NEXT:    s_endpgm
8294;
8295; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8296; GFX11-WGP:       ; %bb.0: ; %entry
8297; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
8298; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8299; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8300; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8301; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8302; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
8303; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
8304; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8305; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
8306; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8307; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
8308; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8309; GFX11-WGP-NEXT:    s_endpgm
8310;
8311; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8312; GFX11-CU:       ; %bb.0: ; %entry
8313; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
8314; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8315; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8316; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8317; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8318; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
8319; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
8320; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8321; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
8322; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8323; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8324; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8325; GFX11-CU-NEXT:    s_endpgm
8326;
8327; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8328; GFX12-WGP:       ; %bb.0: ; %entry
8329; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
8330; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8331; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8332; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8333; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8334; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
8335; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
8336; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8337; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
8338; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8339; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
8340; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8341; GFX12-WGP-NEXT:    s_endpgm
8342;
8343; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
8344; GFX12-CU:       ; %bb.0: ; %entry
8345; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
8346; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8347; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8348; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8349; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8350; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
8351; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
8352; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8353; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
8354; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8355; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8356; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8357; GFX12-CU-NEXT:    s_endpgm
8358    ptr addrspace(1) %out, i32 %in, i32 %old) {
8359entry:
8360  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8361  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
8362  %val0 = extractvalue { i32, i1 } %val, 0
8363  store i32 %val0, ptr addrspace(1) %out, align 4
8364  ret void
8365}
8366
; Volatile cmpxchg on global memory (addrspace 1) at syncscope "wavefront" with
; monotonic success ordering and seq_cst failure ordering, where the value
; loaded by the atomic is used afterwards.
;
; The kernel compares the i32 at %out[4] (byte offset 16) against %old, writes
; %in there when they match, and stores the value the atomic loaded to %out[0].
;
; The assertions inside the definition are produced by
; utils/update_llc_test_checks.py; regenerate them with that script rather than
; editing them by hand. For each of the thirteen configurations they expect a
; returning compare-and-swap, a wait on vmcnt (loadcnt on the gfx12 targets)
; and then the store of the result; none of the expected sequences contains a
; cache invalidate or write-back instruction.
8367define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
8368; GFX6-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8369; GFX6:       ; %bb.0: ; %entry
8370; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
8371; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8372; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
8373; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
8374; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8375; GFX6-NEXT:    s_mov_b32 s12, s5
8376; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
8377; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
8378; GFX6-NEXT:    s_mov_b32 s11, -1
8379; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
8380; GFX6-NEXT:    s_mov_b32 s5, s12
8381; GFX6-NEXT:    s_mov_b32 s6, s11
8382; GFX6-NEXT:    s_mov_b32 s7, s10
8383; GFX6-NEXT:    v_mov_b32_e32 v0, s9
8384; GFX6-NEXT:    v_mov_b32_e32 v2, s8
8385; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8386; GFX6-NEXT:    v_mov_b32_e32 v1, v2
8387; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
8388; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8389; GFX6-NEXT:    s_waitcnt vmcnt(0)
8390; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8391; GFX6-NEXT:    s_endpgm
8392;
8393; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8394; GFX7:       ; %bb.0: ; %entry
8395; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8396; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8397; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8398; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8399; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8400; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8401; GFX7-NEXT:    s_mov_b32 s6, s4
8402; GFX7-NEXT:    s_mov_b32 s7, s5
8403; GFX7-NEXT:    s_mov_b32 s11, s12
8404; GFX7-NEXT:    s_mov_b32 s10, s13
8405; GFX7-NEXT:    s_add_u32 s6, s6, s11
8406; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8407; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8408; GFX7-NEXT:    s_mov_b32 s7, s10
8409; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8410; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8411; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8412; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8413; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8414; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8415; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8416; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8417; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8418; GFX7-NEXT:    s_waitcnt vmcnt(0)
8419; GFX7-NEXT:    flat_store_dword v[0:1], v2
8420; GFX7-NEXT:    s_endpgm
8421;
8422; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8423; GFX10-WGP:       ; %bb.0: ; %entry
8424; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
8425; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8426; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
8427; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
8428; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8429; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8430; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
8431; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8432; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
8433; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8434; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8435; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
8436; GFX10-WGP-NEXT:    s_endpgm
8437;
8438; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8439; GFX10-CU:       ; %bb.0: ; %entry
8440; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
8441; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8442; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
8443; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
8444; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8445; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8446; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
8447; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8448; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
8449; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8450; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8451; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
8452; GFX10-CU-NEXT:    s_endpgm
8453;
8454; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8455; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8456; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8457; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8458; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8459; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8460; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8461; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
8462; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
8463; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
8464; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
8465; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
8466; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
8467; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
8468; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8469; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
8470; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
8471; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8472; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
8473; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
8474; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8475; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8476; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8477; SKIP-CACHE-INV-NEXT:    s_endpgm
8478;
8479; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8480; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8481; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8482; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8483; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8484; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8485; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8486; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8487; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8488; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8489; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8490; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8491; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8492; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8493; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8494;
8495; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8496; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8497; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8498; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8499; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8500; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8501; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8502; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8503; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8504; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8505; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8506; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8507; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8508; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8509; GFX90A-TGSPLIT-NEXT:    s_endpgm
8510;
8511; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8512; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8513; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8514; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8515; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8516; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8517; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8518; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8519; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8520; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8521; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8522; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8523; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8524; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8525; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8526;
8527; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8528; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8529; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8530; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8531; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8532; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8533; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8534; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8535; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8536; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8537; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8538; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8539; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8540; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8541; GFX940-TGSPLIT-NEXT:    s_endpgm
8542;
8543; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8544; GFX11-WGP:       ; %bb.0: ; %entry
8545; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
8546; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8547; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8548; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8549; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8550; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
8551; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
8552; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8553; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
8554; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8555; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
8556; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8557; GFX11-WGP-NEXT:    s_endpgm
8558;
8559; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8560; GFX11-CU:       ; %bb.0: ; %entry
8561; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
8562; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8563; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8564; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8565; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8566; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
8567; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
8568; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8569; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
8570; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8571; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8572; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8573; GFX11-CU-NEXT:    s_endpgm
8574;
8575; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8576; GFX12-WGP:       ; %bb.0: ; %entry
8577; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
8578; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8579; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8580; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8581; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8582; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
8583; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
8584; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8585; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
8586; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8587; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
8588; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8589; GFX12-WGP-NEXT:    s_endpgm
8590;
8591; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
8592; GFX12-CU:       ; %bb.0: ; %entry
8593; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
8594; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8595; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8596; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8597; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8598; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
8599; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
8600; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8601; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
8602; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8603; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8604; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8605; GFX12-CU-NEXT:    s_endpgm
8606    ptr addrspace(1) %out, i32 %in, i32 %old) {
8607entry:
8608  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8609  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
8610  %val0 = extractvalue { i32, i1 } %val, 0
8611  store i32 %val0, ptr addrspace(1) %out, align 4
8612  ret void
8613}
8614
; Volatile cmpxchg on global memory (addrspace 1) at syncscope "wavefront" with
; acquire success ordering and seq_cst failure ordering, where the value loaded
; by the atomic is used afterwards.
;
; The kernel compares the i32 at %out[4] (byte offset 16) against %old, writes
; %in there when they match, and stores the value the atomic loaded to %out[0].
;
; The assertions inside the definition are produced by
; utils/update_llc_test_checks.py; regenerate them with that script rather than
; editing them by hand. For each of the thirteen configurations they expect a
; returning compare-and-swap, a wait on vmcnt (loadcnt on the gfx12 targets)
; and then the store of the result; none of the expected sequences contains a
; cache invalidate or write-back instruction.
8615define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
8616; GFX6-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8617; GFX6:       ; %bb.0: ; %entry
8618; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
8619; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8620; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
8621; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
8622; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8623; GFX6-NEXT:    s_mov_b32 s12, s5
8624; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
8625; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
8626; GFX6-NEXT:    s_mov_b32 s11, -1
8627; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
8628; GFX6-NEXT:    s_mov_b32 s5, s12
8629; GFX6-NEXT:    s_mov_b32 s6, s11
8630; GFX6-NEXT:    s_mov_b32 s7, s10
8631; GFX6-NEXT:    v_mov_b32_e32 v0, s9
8632; GFX6-NEXT:    v_mov_b32_e32 v2, s8
8633; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8634; GFX6-NEXT:    v_mov_b32_e32 v1, v2
8635; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
8636; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8637; GFX6-NEXT:    s_waitcnt vmcnt(0)
8638; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8639; GFX6-NEXT:    s_endpgm
8640;
8641; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8642; GFX7:       ; %bb.0: ; %entry
8643; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8644; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8645; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8646; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8647; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8648; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8649; GFX7-NEXT:    s_mov_b32 s6, s4
8650; GFX7-NEXT:    s_mov_b32 s7, s5
8651; GFX7-NEXT:    s_mov_b32 s11, s12
8652; GFX7-NEXT:    s_mov_b32 s10, s13
8653; GFX7-NEXT:    s_add_u32 s6, s6, s11
8654; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8655; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8656; GFX7-NEXT:    s_mov_b32 s7, s10
8657; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8658; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8659; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8660; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8661; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8662; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8663; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8664; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8665; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8666; GFX7-NEXT:    s_waitcnt vmcnt(0)
8667; GFX7-NEXT:    flat_store_dword v[0:1], v2
8668; GFX7-NEXT:    s_endpgm
8669;
8670; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8671; GFX10-WGP:       ; %bb.0: ; %entry
8672; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
8673; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8674; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
8675; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
8676; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8677; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8678; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
8679; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8680; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
8681; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8682; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8683; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
8684; GFX10-WGP-NEXT:    s_endpgm
8685;
8686; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8687; GFX10-CU:       ; %bb.0: ; %entry
8688; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
8689; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8690; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
8691; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
8692; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8693; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8694; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
8695; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8696; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
8697; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8698; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8699; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
8700; GFX10-CU-NEXT:    s_endpgm
8701;
8702; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8703; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8704; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8705; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8706; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8707; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8708; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8709; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
8710; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
8711; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
8712; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
8713; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
8714; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
8715; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
8716; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8717; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
8718; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
8719; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8720; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
8721; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
8722; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8723; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8724; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8725; SKIP-CACHE-INV-NEXT:    s_endpgm
8726;
8727; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8728; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8729; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8730; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8731; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8732; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8733; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8734; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8735; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8736; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8737; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8738; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8739; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8740; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8741; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8742;
8743; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8744; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8745; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8746; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8747; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8748; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8749; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8750; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8751; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8752; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8753; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8754; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8755; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8756; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8757; GFX90A-TGSPLIT-NEXT:    s_endpgm
8758;
8759; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8760; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
8761; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8762; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8763; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8764; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8765; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8766; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8767; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8768; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8769; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8770; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8771; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8772; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8773; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
8774;
8775; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8776; GFX940-TGSPLIT:       ; %bb.0: ; %entry
8777; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8778; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
8779; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
8780; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
8781; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8782; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
8783; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
8784; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8785; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8786; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
8787; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8788; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
8789; GFX940-TGSPLIT-NEXT:    s_endpgm
8790;
8791; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8792; GFX11-WGP:       ; %bb.0: ; %entry
8793; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
8794; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8795; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8796; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8797; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8798; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
8799; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
8800; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8801; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
8802; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8803; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
8804; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8805; GFX11-WGP-NEXT:    s_endpgm
8806;
8807; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8808; GFX11-CU:       ; %bb.0: ; %entry
8809; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
8810; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8811; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8812; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8813; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
8814; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
8815; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
8816; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8817; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
8818; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
8819; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
8820; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8821; GFX11-CU-NEXT:    s_endpgm
8822;
8823; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8824; GFX12-WGP:       ; %bb.0: ; %entry
8825; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
8826; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8827; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
8828; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
8829; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
8830; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
8831; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
8832; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8833; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
8834; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8835; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
8836; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
8837; GFX12-WGP-NEXT:    s_endpgm
8838;
8839; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
8840; GFX12-CU:       ; %bb.0: ; %entry
8841; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
8842; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
8843; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
8844; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
8845; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
8846; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
8847; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
8848; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8849; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
8850; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
8851; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
8852; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
8853; GFX12-CU-NEXT:    s_endpgm
8854    ptr addrspace(1) %out, i32 %in, i32 %old) {
8855entry:
8856  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8857  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
8858  %val0 = extractvalue { i32, i1 } %val, 0
8859  store i32 %val0, ptr addrspace(1) %out, align 4
8860  ret void
8861}
8862
; Kernel exercising a value-returning, volatile cmpxchg on a global
; (addrspace(1)) pointer at syncscope("wavefront"), with success ordering
; `release` and failure ordering `seq_cst`.
; The IR compares the i32 at element 4 of %out against %old, swaps in %in, and
; stores the value that was read (field 0 of the result pair) to %out.
; The assertion lines for each llc configuration are autogenerated (see the
; file header); each expects one compare-swap that returns a value, a wait for
; that result, and then the store.
8863define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
8864; GFX6-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8865; GFX6:       ; %bb.0: ; %entry
8866; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
8867; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8868; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
8869; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
8870; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8871; GFX6-NEXT:    s_mov_b32 s12, s5
8872; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
8873; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
8874; GFX6-NEXT:    s_mov_b32 s11, -1
8875; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
8876; GFX6-NEXT:    s_mov_b32 s5, s12
8877; GFX6-NEXT:    s_mov_b32 s6, s11
8878; GFX6-NEXT:    s_mov_b32 s7, s10
8879; GFX6-NEXT:    v_mov_b32_e32 v0, s9
8880; GFX6-NEXT:    v_mov_b32_e32 v2, s8
8881; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8882; GFX6-NEXT:    v_mov_b32_e32 v1, v2
8883; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
8884; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8885; GFX6-NEXT:    s_waitcnt vmcnt(0)
8886; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8887; GFX6-NEXT:    s_endpgm
8888;
8889; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8890; GFX7:       ; %bb.0: ; %entry
8891; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
8892; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
8893; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
8894; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
8895; GFX7-NEXT:    s_mov_b64 s[12:13], 16
8896; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8897; GFX7-NEXT:    s_mov_b32 s6, s4
8898; GFX7-NEXT:    s_mov_b32 s7, s5
8899; GFX7-NEXT:    s_mov_b32 s11, s12
8900; GFX7-NEXT:    s_mov_b32 s10, s13
8901; GFX7-NEXT:    s_add_u32 s6, s6, s11
8902; GFX7-NEXT:    s_addc_u32 s10, s7, s10
8903; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8904; GFX7-NEXT:    s_mov_b32 s7, s10
8905; GFX7-NEXT:    v_mov_b32_e32 v2, s9
8906; GFX7-NEXT:    v_mov_b32_e32 v0, s8
8907; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8908; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8909; GFX7-NEXT:    v_mov_b32_e32 v0, s6
8910; GFX7-NEXT:    v_mov_b32_e32 v1, s7
8911; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8912; GFX7-NEXT:    v_mov_b32_e32 v0, s4
8913; GFX7-NEXT:    v_mov_b32_e32 v1, s5
8914; GFX7-NEXT:    s_waitcnt vmcnt(0)
8915; GFX7-NEXT:    flat_store_dword v[0:1], v2
8916; GFX7-NEXT:    s_endpgm
8917;
8918; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8919; GFX10-WGP:       ; %bb.0: ; %entry
8920; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
8921; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8922; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
8923; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
8924; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
8925; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
8926; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
8927; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8928; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
8929; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8930; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
8931; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
8932; GFX10-WGP-NEXT:    s_endpgm
8933;
8934; GFX10-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8935; GFX10-CU:       ; %bb.0: ; %entry
8936; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
8937; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8938; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
8939; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
8940; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
8941; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
8942; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
8943; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
8944; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
8945; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
8946; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
8947; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
8948; GFX10-CU-NEXT:    s_endpgm
8949;
8950; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8951; SKIP-CACHE-INV:       ; %bb.0: ; %entry
8952; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
8953; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
8954; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
8955; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
8956; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
8957; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
8958; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
8959; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
8960; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
8961; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
8962; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
8963; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
8964; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
8965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
8966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
8967; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
8968; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
8969; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
8970; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
8971; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
8972; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8973; SKIP-CACHE-INV-NEXT:    s_endpgm
8974;
8975; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8976; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
8977; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8978; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8979; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8980; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8981; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8982; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8983; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
8984; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8985; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
8986; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
8987; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
8988; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
8989; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
8990;
8991; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
8992; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
8993; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
8994; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
8995; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
8996; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
8997; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
8998; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
8999; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
9000; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9001; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9002; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
9003; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9004; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9005; GFX90A-TGSPLIT-NEXT:    s_endpgm
9006;
9007; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
9008; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9009; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9010; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9011; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9012; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9013; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9014; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9015; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
9016; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9017; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9018; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
9019; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9020; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9021; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9022;
9023; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
9024; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9025; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9026; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9027; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9028; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9029; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9030; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9031; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
9032; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9033; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9034; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
9035; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9036; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9037; GFX940-TGSPLIT-NEXT:    s_endpgm
9038;
9039; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
9040; GFX11-WGP:       ; %bb.0: ; %entry
9041; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
9042; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9043; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9044; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9045; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9046; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
9047; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
9048; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9049; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
9050; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
9051; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
9052; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9053; GFX11-WGP-NEXT:    s_endpgm
9054;
9055; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
9056; GFX11-CU:       ; %bb.0: ; %entry
9057; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
9058; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9059; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9060; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9061; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9062; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
9063; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
9064; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9065; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
9066; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
9067; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9068; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9069; GFX11-CU-NEXT:    s_endpgm
9070;
9071; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
9072; GFX12-WGP:       ; %bb.0: ; %entry
9073; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
9074; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9075; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9076; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9077; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9078; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
9079; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
9080; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9081; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
9082; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
9083; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
9084; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9085; GFX12-WGP-NEXT:    s_endpgm
9086;
9087; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
9088; GFX12-CU:       ; %bb.0: ; %entry
9089; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
9090; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9091; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9092; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9093; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9094; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
9095; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
9096; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9097; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
9098; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
9099; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9100; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9101; GFX12-CU-NEXT:    s_endpgm
9102    ptr addrspace(1) %out, i32 %in, i32 %old) {
9103entry:
  ; Address of i32 element 4 of %out; this is the `offset:16` seen in the
  ; expected compare-swap instructions.
9104  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Volatile compare-exchange at wavefront scope: expected value %old, new
  ; value %in, success ordering release, failure ordering seq_cst.
9105  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is unused.
9106  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the read value to %out keeps the returning form of the atomic live.
9107  store i32 %val0, ptr addrspace(1) %out, align 4
9108  ret void
9109}
9110
; Kernel exercising a value-returning, volatile cmpxchg on a global
; (addrspace(1)) pointer at syncscope("wavefront"), with success ordering
; `acq_rel` and failure ordering `seq_cst`.
; The IR compares the i32 at element 4 of %out against %old, swaps in %in, and
; stores the value that was read (field 0 of the result pair) to %out.
; The assertion lines for each llc configuration are autogenerated (see the
; file header); each expects one compare-swap that returns a value, a wait for
; that result, and then the store.
9111define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
9112; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9113; GFX6:       ; %bb.0: ; %entry
9114; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
9115; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9116; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
9117; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
9118; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9119; GFX6-NEXT:    s_mov_b32 s12, s5
9120; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9121; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
9122; GFX6-NEXT:    s_mov_b32 s11, -1
9123; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9124; GFX6-NEXT:    s_mov_b32 s5, s12
9125; GFX6-NEXT:    s_mov_b32 s6, s11
9126; GFX6-NEXT:    s_mov_b32 s7, s10
9127; GFX6-NEXT:    v_mov_b32_e32 v0, s9
9128; GFX6-NEXT:    v_mov_b32_e32 v2, s8
9129; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
9130; GFX6-NEXT:    v_mov_b32_e32 v1, v2
9131; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
9132; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
9133; GFX6-NEXT:    s_waitcnt vmcnt(0)
9134; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
9135; GFX6-NEXT:    s_endpgm
9136;
9137; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9138; GFX7:       ; %bb.0: ; %entry
9139; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9140; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9141; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9142; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9143; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9144; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9145; GFX7-NEXT:    s_mov_b32 s6, s4
9146; GFX7-NEXT:    s_mov_b32 s7, s5
9147; GFX7-NEXT:    s_mov_b32 s11, s12
9148; GFX7-NEXT:    s_mov_b32 s10, s13
9149; GFX7-NEXT:    s_add_u32 s6, s6, s11
9150; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9151; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9152; GFX7-NEXT:    s_mov_b32 s7, s10
9153; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9154; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9155; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9156; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9157; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9158; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9159; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9160; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9161; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9162; GFX7-NEXT:    s_waitcnt vmcnt(0)
9163; GFX7-NEXT:    flat_store_dword v[0:1], v2
9164; GFX7-NEXT:    s_endpgm
9165;
9166; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9167; GFX10-WGP:       ; %bb.0: ; %entry
9168; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
9169; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9170; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
9171; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
9172; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9173; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9174; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
9175; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9176; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
9177; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
9178; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9179; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
9180; GFX10-WGP-NEXT:    s_endpgm
9181;
9182; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9183; GFX10-CU:       ; %bb.0: ; %entry
9184; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
9185; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9186; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
9187; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
9188; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9189; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9190; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
9191; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9192; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
9193; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
9194; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9195; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
9196; GFX10-CU-NEXT:    s_endpgm
9197;
9198; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9199; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9200; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9201; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9202; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9203; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9204; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9205; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
9206; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
9207; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
9208; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
9209; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
9210; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
9211; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
9212; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9213; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
9214; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
9215; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
9216; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
9217; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
9218; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
9219; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9220; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9221; SKIP-CACHE-INV-NEXT:    s_endpgm
9222;
9223; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9224; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9225; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9226; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9227; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9228; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9229; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9230; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9231; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
9232; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9233; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9234; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
9235; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9236; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9237; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9238;
9239; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9240; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9241; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9242; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9243; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9244; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9245; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9246; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9247; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
9248; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9249; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9250; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
9251; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9252; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9253; GFX90A-TGSPLIT-NEXT:    s_endpgm
9254;
9255; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9256; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9257; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9258; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9259; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9260; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9261; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9262; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9263; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
9264; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9265; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9266; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
9267; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9268; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9269; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9270;
9271; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9272; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9273; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9274; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9275; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9276; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9277; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9278; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9279; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
9280; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9281; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9282; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
9283; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9284; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9285; GFX940-TGSPLIT-NEXT:    s_endpgm
9286;
9287; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9288; GFX11-WGP:       ; %bb.0: ; %entry
9289; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
9290; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9291; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9292; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9293; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9294; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
9295; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
9296; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9297; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
9298; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
9299; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
9300; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9301; GFX11-WGP-NEXT:    s_endpgm
9302;
9303; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9304; GFX11-CU:       ; %bb.0: ; %entry
9305; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
9306; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9307; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9308; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9309; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9310; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
9311; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
9312; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9313; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
9314; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
9315; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9316; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9317; GFX11-CU-NEXT:    s_endpgm
9318;
9319; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9320; GFX12-WGP:       ; %bb.0: ; %entry
9321; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
9322; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9323; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9324; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9325; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9326; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
9327; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
9328; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9329; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
9330; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
9331; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
9332; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9333; GFX12-WGP-NEXT:    s_endpgm
9334;
9335; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
9336; GFX12-CU:       ; %bb.0: ; %entry
9337; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
9338; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9339; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9340; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9341; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9342; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
9343; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
9344; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9345; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
9346; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
9347; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9348; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9349; GFX12-CU-NEXT:    s_endpgm
9350    ptr addrspace(1) %out, i32 %in, i32 %old) {
9351entry:
  ; Address of i32 element 4 of %out; this is the `offset:16` seen in the
  ; expected compare-swap instructions.
9352  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Volatile compare-exchange at wavefront scope: expected value %old, new
  ; value %in, success ordering acq_rel, failure ordering seq_cst.
9353  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory; the i1
  ; success flag is unused.
9354  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the read value to %out keeps the returning form of the atomic live.
9355  store i32 %val0, ptr addrspace(1) %out, align 4
9356  ret void
9357}
9358
9359define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
9360; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9361; GFX6:       ; %bb.0: ; %entry
9362; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
9363; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9364; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
9365; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
9366; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9367; GFX6-NEXT:    s_mov_b32 s12, s5
9368; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9369; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
9370; GFX6-NEXT:    s_mov_b32 s11, -1
9371; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9372; GFX6-NEXT:    s_mov_b32 s5, s12
9373; GFX6-NEXT:    s_mov_b32 s6, s11
9374; GFX6-NEXT:    s_mov_b32 s7, s10
9375; GFX6-NEXT:    v_mov_b32_e32 v0, s9
9376; GFX6-NEXT:    v_mov_b32_e32 v2, s8
9377; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
9378; GFX6-NEXT:    v_mov_b32_e32 v1, v2
9379; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
9380; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
9381; GFX6-NEXT:    s_waitcnt vmcnt(0)
9382; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
9383; GFX6-NEXT:    s_endpgm
9384;
9385; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9386; GFX7:       ; %bb.0: ; %entry
9387; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
9388; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
9389; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
9390; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
9391; GFX7-NEXT:    s_mov_b64 s[12:13], 16
9392; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9393; GFX7-NEXT:    s_mov_b32 s6, s4
9394; GFX7-NEXT:    s_mov_b32 s7, s5
9395; GFX7-NEXT:    s_mov_b32 s11, s12
9396; GFX7-NEXT:    s_mov_b32 s10, s13
9397; GFX7-NEXT:    s_add_u32 s6, s6, s11
9398; GFX7-NEXT:    s_addc_u32 s10, s7, s10
9399; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9400; GFX7-NEXT:    s_mov_b32 s7, s10
9401; GFX7-NEXT:    v_mov_b32_e32 v2, s9
9402; GFX7-NEXT:    v_mov_b32_e32 v0, s8
9403; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9404; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9405; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9406; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9407; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9408; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9409; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9410; GFX7-NEXT:    s_waitcnt vmcnt(0)
9411; GFX7-NEXT:    flat_store_dword v[0:1], v2
9412; GFX7-NEXT:    s_endpgm
9413;
9414; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9415; GFX10-WGP:       ; %bb.0: ; %entry
9416; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
9417; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9418; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
9419; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
9420; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9421; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
9422; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
9423; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9424; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
9425; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
9426; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9427; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
9428; GFX10-WGP-NEXT:    s_endpgm
9429;
9430; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9431; GFX10-CU:       ; %bb.0: ; %entry
9432; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
9433; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9434; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
9435; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
9436; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9437; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
9438; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
9439; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9440; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
9441; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
9442; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9443; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
9444; GFX10-CU-NEXT:    s_endpgm
9445;
9446; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9447; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9448; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
9449; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
9450; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
9451; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
9452; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9453; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
9454; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
9455; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
9456; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
9457; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
9458; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
9459; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
9460; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
9461; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
9462; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
9463; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
9464; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
9465; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
9466; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
9467; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9468; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9469; SKIP-CACHE-INV-NEXT:    s_endpgm
9470;
9471; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9472; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9473; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9474; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9475; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9476; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9477; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9478; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9479; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
9480; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9481; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9482; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
9483; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9484; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9485; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9486;
9487; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9488; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9489; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9490; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
9491; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
9492; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
9493; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9494; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
9495; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
9496; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9497; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9498; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
9499; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9500; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9501; GFX90A-TGSPLIT-NEXT:    s_endpgm
9502;
9503; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9504; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9505; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9506; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9507; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9508; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9509; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9510; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9511; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
9512; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9513; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9514; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
9515; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9516; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9517; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9518;
9519; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9520; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9521; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9522; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
9523; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
9524; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
9525; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9526; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
9527; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
9528; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9529; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
9530; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
9531; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9532; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9533; GFX940-TGSPLIT-NEXT:    s_endpgm
9534;
9535; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9536; GFX11-WGP:       ; %bb.0: ; %entry
9537; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
9538; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9539; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9540; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9541; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9542; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
9543; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
9544; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9545; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
9546; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
9547; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
9548; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9549; GFX11-WGP-NEXT:    s_endpgm
9550;
9551; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9552; GFX11-CU:       ; %bb.0: ; %entry
9553; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
9554; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9555; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9556; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9557; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9558; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
9559; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
9560; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9561; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
9562; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
9563; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9564; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9565; GFX11-CU-NEXT:    s_endpgm
9566;
9567; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9568; GFX12-WGP:       ; %bb.0: ; %entry
9569; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
9570; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9571; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
9572; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
9573; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9574; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
9575; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
9576; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9577; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
9578; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
9579; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
9580; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9581; GFX12-WGP-NEXT:    s_endpgm
9582;
9583; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
9584; GFX12-CU:       ; %bb.0: ; %entry
9585; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
9586; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
9587; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
9588; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
9589; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9590; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
9591; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
9592; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
9593; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
9594; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
9595; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9596; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9597; GFX12-CU-NEXT:    s_endpgm
9598    ptr addrspace(1) %out, i32 %in, i32 %old) {
9599entry:
9600  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
9601  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
9602  %val0 = extractvalue { i32, i1 } %val, 0
9603  store i32 %val0, ptr addrspace(1) %out, align 4
9604  ret void
9605}
9606
; Test kernel: atomically loads an i32 from %in (addrspace(1)) with
; syncscope("wavefront-one-as") and unordered ordering, then writes the loaded
; value to %out with a plain store.
; The autogenerated assertions that follow give, for each FileCheck prefix, the
; exact instruction sequence llc is expected to emit: scalar setup, one load,
; a wait on that load (s_waitcnt vmcnt(0) or s_wait_loadcnt 0x0), one store,
; and s_endpgm.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
9607define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
9608; GFX6-LABEL: global_wavefront_one_as_unordered_load:
9609; GFX6:       ; %bb.0: ; %entry
9610; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
9611; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
9612; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
9613; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9614; GFX6-NEXT:    s_mov_b32 s6, s9
9615; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
9616; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
9617; GFX6-NEXT:    s_mov_b32 s13, -1
9618; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
9619; GFX6-NEXT:    s_mov_b32 s9, s6
9620; GFX6-NEXT:    s_mov_b32 s10, s13
9621; GFX6-NEXT:    s_mov_b32 s11, s12
9622; GFX6-NEXT:    s_mov_b32 s14, s5
9623; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9624; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9625; GFX6-NEXT:    s_mov_b32 s5, s14
9626; GFX6-NEXT:    s_mov_b32 s6, s13
9627; GFX6-NEXT:    s_mov_b32 s7, s12
9628; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
9629; GFX6-NEXT:    s_waitcnt vmcnt(0)
9630; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
9631; GFX6-NEXT:    s_endpgm
9632;
9633; GFX7-LABEL: global_wavefront_one_as_unordered_load:
9634; GFX7:       ; %bb.0: ; %entry
9635; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9636; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9637; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9638; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9639; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9640; GFX7-NEXT:    flat_load_dword v2, v[0:1]
9641; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9642; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9643; GFX7-NEXT:    s_waitcnt vmcnt(0)
9644; GFX7-NEXT:    flat_store_dword v[0:1], v2
9645; GFX7-NEXT:    s_endpgm
9646;
9647; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load:
9648; GFX10-WGP:       ; %bb.0: ; %entry
9649; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
9650; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9651; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9652; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9653; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
9654; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9655; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
9656; GFX10-WGP-NEXT:    s_endpgm
9657;
9658; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load:
9659; GFX10-CU:       ; %bb.0: ; %entry
9660; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
9661; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9662; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9663; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9664; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
9665; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9666; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
9667; GFX10-CU-NEXT:    s_endpgm
9668;
9669; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load:
9670; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9671; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
9672; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
9673; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9674; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9675; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
9676; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9677; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
9678; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
9679; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9680; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
9681; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9682; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9683; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
9684; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
9685; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
9686; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
9687; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
9688; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
9689; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
9690; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9691; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9692; SKIP-CACHE-INV-NEXT:    s_endpgm
9693;
9694; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
9695; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9696; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9697; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9698; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9699; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9700; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
9701; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9702; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9703; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9704;
9705; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
9706; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9707; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9708; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9709; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9710; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9711; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
9712; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9713; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9714; GFX90A-TGSPLIT-NEXT:    s_endpgm
9715;
9716; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
9717; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9718; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9719; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9720; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9721; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9722; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
9723; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9724; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9725; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9726;
9727; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
9728; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9729; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9730; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9731; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9732; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9733; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
9734; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9735; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9736; GFX940-TGSPLIT-NEXT:    s_endpgm
9737;
9738; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_load:
9739; GFX11-WGP:       ; %bb.0: ; %entry
9740; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
9741; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9742; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9743; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9744; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
9745; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
9746; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9747; GFX11-WGP-NEXT:    s_endpgm
9748;
9749; GFX11-CU-LABEL: global_wavefront_one_as_unordered_load:
9750; GFX11-CU:       ; %bb.0: ; %entry
9751; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
9752; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9753; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9754; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9755; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
9756; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9757; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9758; GFX11-CU-NEXT:    s_endpgm
9759;
9760; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load:
9761; GFX12-WGP:       ; %bb.0: ; %entry
9762; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
9763; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9764; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9765; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9766; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
9767; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
9768; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9769; GFX12-WGP-NEXT:    s_endpgm
9770;
9771; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load:
9772; GFX12-CU:       ; %bb.0: ; %entry
9773; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
9774; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9775; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9776; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9777; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
9778; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9779; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9780; GFX12-CU-NEXT:    s_endpgm
9781    ptr addrspace(1) %in, ptr addrspace(1) %out) {
9782entry:
  ; Operation under test: wavefront-one-as scoped, unordered atomic i32 load.
9783  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4
  ; Plain (non-atomic) store of the loaded value to %out.
9784  store i32 %val, ptr addrspace(1) %out
9785  ret void
9786}
9787
; Test kernel: atomically loads an i32 from %in (addrspace(1)) with
; syncscope("wavefront-one-as") and monotonic ordering, then writes the loaded
; value to %out with a plain store.
; The autogenerated assertions that follow give, for each FileCheck prefix, the
; exact instruction sequence llc is expected to emit: scalar setup, one load,
; a wait on that load (s_waitcnt vmcnt(0) or s_wait_loadcnt 0x0), one store,
; and s_endpgm.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
9788define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
9789; GFX6-LABEL: global_wavefront_one_as_monotonic_load:
9790; GFX6:       ; %bb.0: ; %entry
9791; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
9792; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
9793; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
9794; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9795; GFX6-NEXT:    s_mov_b32 s6, s9
9796; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
9797; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
9798; GFX6-NEXT:    s_mov_b32 s13, -1
9799; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
9800; GFX6-NEXT:    s_mov_b32 s9, s6
9801; GFX6-NEXT:    s_mov_b32 s10, s13
9802; GFX6-NEXT:    s_mov_b32 s11, s12
9803; GFX6-NEXT:    s_mov_b32 s14, s5
9804; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9805; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9806; GFX6-NEXT:    s_mov_b32 s5, s14
9807; GFX6-NEXT:    s_mov_b32 s6, s13
9808; GFX6-NEXT:    s_mov_b32 s7, s12
9809; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
9810; GFX6-NEXT:    s_waitcnt vmcnt(0)
9811; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
9812; GFX6-NEXT:    s_endpgm
9813;
9814; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
9815; GFX7:       ; %bb.0: ; %entry
9816; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9817; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9818; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
9819; GFX7-NEXT:    v_mov_b32_e32 v0, s6
9820; GFX7-NEXT:    v_mov_b32_e32 v1, s7
9821; GFX7-NEXT:    flat_load_dword v2, v[0:1]
9822; GFX7-NEXT:    v_mov_b32_e32 v0, s4
9823; GFX7-NEXT:    v_mov_b32_e32 v1, s5
9824; GFX7-NEXT:    s_waitcnt vmcnt(0)
9825; GFX7-NEXT:    flat_store_dword v[0:1], v2
9826; GFX7-NEXT:    s_endpgm
9827;
9828; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load:
9829; GFX10-WGP:       ; %bb.0: ; %entry
9830; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
9831; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9832; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9833; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9834; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
9835; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
9836; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
9837; GFX10-WGP-NEXT:    s_endpgm
9838;
9839; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load:
9840; GFX10-CU:       ; %bb.0: ; %entry
9841; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
9842; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9843; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9844; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
9845; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
9846; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
9847; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
9848; GFX10-CU-NEXT:    s_endpgm
9849;
9850; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load:
9851; SKIP-CACHE-INV:       ; %bb.0: ; %entry
9852; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
9853; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
9854; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
9855; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
9856; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
9857; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9858; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
9859; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
9860; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9861; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
9862; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
9863; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
9864; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
9865; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
9866; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
9867; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
9868; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
9869; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
9870; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
9871; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
9872; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9873; SKIP-CACHE-INV-NEXT:    s_endpgm
9874;
9875; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
9876; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
9877; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9878; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9879; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9880; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9881; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
9882; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9883; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9884; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
9885;
9886; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
9887; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
9888; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9889; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9890; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
9891; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9892; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
9893; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9894; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
9895; GFX90A-TGSPLIT-NEXT:    s_endpgm
9896;
9897; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
9898; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
9899; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9900; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9901; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9902; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9903; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
9904; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9905; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9906; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
9907;
9908; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
9909; GFX940-TGSPLIT:       ; %bb.0: ; %entry
9910; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
9911; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
9912; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
9913; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
9914; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
9915; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
9916; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
9917; GFX940-TGSPLIT-NEXT:    s_endpgm
9918;
9919; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_load:
9920; GFX11-WGP:       ; %bb.0: ; %entry
9921; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
9922; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9923; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9924; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
9925; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
9926; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
9927; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9928; GFX11-WGP-NEXT:    s_endpgm
9929;
9930; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_load:
9931; GFX11-CU:       ; %bb.0: ; %entry
9932; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
9933; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9934; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9935; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
9936; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
9937; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
9938; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9939; GFX11-CU-NEXT:    s_endpgm
9940;
9941; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load:
9942; GFX12-WGP:       ; %bb.0: ; %entry
9943; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
9944; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9945; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9946; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
9947; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
9948; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
9949; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
9950; GFX12-WGP-NEXT:    s_endpgm
9951;
9952; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load:
9953; GFX12-CU:       ; %bb.0: ; %entry
9954; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
9955; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
9956; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
9957; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
9958; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
9959; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
9960; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
9961; GFX12-CU-NEXT:    s_endpgm
9962    ptr addrspace(1) %in, ptr addrspace(1) %out) {
9963entry:
  ; Operation under test: wavefront-one-as scoped, monotonic atomic i32 load.
9964  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4
  ; Plain (non-atomic) store of the loaded value to %out.
9965  store i32 %val, ptr addrspace(1) %out
9966  ret void
9967}
9968
; Test kernel: atomically loads an i32 from %in (addrspace(1)) with
; syncscope("wavefront-one-as") and acquire ordering, then writes the loaded
; value to %out with a plain store.
; The autogenerated assertions that follow give, for each FileCheck prefix, the
; exact instruction sequence llc is expected to emit: scalar setup, one load,
; a wait on that load (s_waitcnt vmcnt(0) or s_wait_loadcnt 0x0), one store,
; and s_endpgm. No further instruction is asserted after the load for the
; acquire ordering.
; TODO(review): presumably acquire needs no extra synchronization at this
; scope; confirm against the AMDGPU memory model documentation.
; Do not hand-edit the assertions; regenerate them with
; utils/update_llc_test_checks.py.
9969define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
9970; GFX6-LABEL: global_wavefront_one_as_acquire_load:
9971; GFX6:       ; %bb.0: ; %entry
9972; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
9973; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
9974; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
9975; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9976; GFX6-NEXT:    s_mov_b32 s6, s9
9977; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
9978; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
9979; GFX6-NEXT:    s_mov_b32 s13, -1
9980; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
9981; GFX6-NEXT:    s_mov_b32 s9, s6
9982; GFX6-NEXT:    s_mov_b32 s10, s13
9983; GFX6-NEXT:    s_mov_b32 s11, s12
9984; GFX6-NEXT:    s_mov_b32 s14, s5
9985; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
9986; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
9987; GFX6-NEXT:    s_mov_b32 s5, s14
9988; GFX6-NEXT:    s_mov_b32 s6, s13
9989; GFX6-NEXT:    s_mov_b32 s7, s12
9990; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
9991; GFX6-NEXT:    s_waitcnt vmcnt(0)
9992; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
9993; GFX6-NEXT:    s_endpgm
9994;
9995; GFX7-LABEL: global_wavefront_one_as_acquire_load:
9996; GFX7:       ; %bb.0: ; %entry
9997; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9998; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
9999; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10000; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10001; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10002; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10003; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10004; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10005; GFX7-NEXT:    s_waitcnt vmcnt(0)
10006; GFX7-NEXT:    flat_store_dword v[0:1], v2
10007; GFX7-NEXT:    s_endpgm
10008;
10009; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load:
10010; GFX10-WGP:       ; %bb.0: ; %entry
10011; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10012; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10013; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10014; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10015; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
10016; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10017; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
10018; GFX10-WGP-NEXT:    s_endpgm
10019;
10020; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load:
10021; GFX10-CU:       ; %bb.0: ; %entry
10022; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10023; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10024; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10025; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10026; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
10027; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
10028; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
10029; GFX10-CU-NEXT:    s_endpgm
10030;
10031; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load:
10032; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10033; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
10034; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
10035; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10036; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10037; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
10038; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10039; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
10040; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
10041; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10042; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
10043; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
10044; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
10045; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
10046; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10047; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
10048; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
10049; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
10050; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
10051; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
10052; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
10053; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10054; SKIP-CACHE-INV-NEXT:    s_endpgm
10055;
10056; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
10057; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10058; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10059; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10060; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10061; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10062; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
10063; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10064; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10065; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10066;
10067; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
10068; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10069; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10070; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10071; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10072; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10073; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
10074; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10075; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10076; GFX90A-TGSPLIT-NEXT:    s_endpgm
10077;
10078; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
10079; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10080; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10081; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10082; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10083; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10084; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
10085; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10086; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10087; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10088;
10089; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
10090; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10091; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10092; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10093; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10094; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10095; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
10096; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10097; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10098; GFX940-TGSPLIT-NEXT:    s_endpgm
10099;
10100; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_load:
10101; GFX11-WGP:       ; %bb.0: ; %entry
10102; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
10103; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10104; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10105; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10106; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
10107; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
10108; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10109; GFX11-WGP-NEXT:    s_endpgm
10110;
10111; GFX11-CU-LABEL: global_wavefront_one_as_acquire_load:
10112; GFX11-CU:       ; %bb.0: ; %entry
10113; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
10114; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10115; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10116; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10117; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
10118; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
10119; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10120; GFX11-CU-NEXT:    s_endpgm
10121;
10122; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load:
10123; GFX12-WGP:       ; %bb.0: ; %entry
10124; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
10125; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10126; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10127; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10128; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
10129; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
10130; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10131; GFX12-WGP-NEXT:    s_endpgm
10132;
10133; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load:
10134; GFX12-CU:       ; %bb.0: ; %entry
10135; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
10136; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10137; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10138; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10139; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
10140; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
10141; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10142; GFX12-CU-NEXT:    s_endpgm
10143    ptr addrspace(1) %in, ptr addrspace(1) %out) {
10144entry:
  ; Operation under test: wavefront-one-as scoped, acquire atomic i32 load.
10145  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4
  ; Plain (non-atomic) store of the loaded value to %out.
10146  store i32 %val, ptr addrspace(1) %out
10147  ret void
10148}
10149
; Test kernel: atomic seq_cst load of an i32 from global memory (addrspace 1)
; at syncscope "wavefront-one-as"; the loaded value is then written to %out
; with an ordinary (non-atomic) store.
; In every expectation below, the only instruction between the global load and
; the dependent store, apart from address-setup moves, is a wait on the load
; counter (s_waitcnt vmcnt(0), or s_wait_loadcnt 0x0 on gfx1200).
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
10150define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
10151; GFX6-LABEL: global_wavefront_one_as_seq_cst_load:
10152; GFX6:       ; %bb.0: ; %entry
10153; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
10154; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
10155; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
10156; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10157; GFX6-NEXT:    s_mov_b32 s6, s9
10158; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
10159; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
10160; GFX6-NEXT:    s_mov_b32 s13, -1
10161; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
10162; GFX6-NEXT:    s_mov_b32 s9, s6
10163; GFX6-NEXT:    s_mov_b32 s10, s13
10164; GFX6-NEXT:    s_mov_b32 s11, s12
10165; GFX6-NEXT:    s_mov_b32 s14, s5
10166; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10167; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10168; GFX6-NEXT:    s_mov_b32 s5, s14
10169; GFX6-NEXT:    s_mov_b32 s6, s13
10170; GFX6-NEXT:    s_mov_b32 s7, s12
10171; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
10172; GFX6-NEXT:    s_waitcnt vmcnt(0)
10173; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
10174; GFX6-NEXT:    s_endpgm
10175;
10176; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
10177; GFX7:       ; %bb.0: ; %entry
10178; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10179; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
10180; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10181; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10182; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10183; GFX7-NEXT:    flat_load_dword v2, v[0:1]
10184; GFX7-NEXT:    v_mov_b32_e32 v0, s4
10185; GFX7-NEXT:    v_mov_b32_e32 v1, s5
10186; GFX7-NEXT:    s_waitcnt vmcnt(0)
10187; GFX7-NEXT:    flat_store_dword v[0:1], v2
10188; GFX7-NEXT:    s_endpgm
10189;
10190; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
10191; GFX10-WGP:       ; %bb.0: ; %entry
10192; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10193; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10194; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10195; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10196; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
10197; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
10198; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
10199; GFX10-WGP-NEXT:    s_endpgm
10200;
10201; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load:
10202; GFX10-CU:       ; %bb.0: ; %entry
10203; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10204; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10205; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10206; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10207; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
10208; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
10209; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
10210; GFX10-CU-NEXT:    s_endpgm
10211;
10212; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load:
10213; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10214; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
10215; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
10216; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10217; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10218; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
10219; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10220; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
10221; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
10222; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10223; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
10224; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
10225; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
10226; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
10227; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10228; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
10229; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
10230; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
10231; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
10232; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
10233; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
10234; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10235; SKIP-CACHE-INV-NEXT:    s_endpgm
10236;
10237; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
10238; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10239; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10240; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10241; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10242; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10243; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
10244; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10245; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10246; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10247;
10248; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
10249; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10250; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10251; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10252; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10253; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10254; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
10255; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10256; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10257; GFX90A-TGSPLIT-NEXT:    s_endpgm
10258;
10259; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
10260; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10261; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10262; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10263; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10264; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10265; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
10266; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10267; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10268; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10269;
10270; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
10271; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10272; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10273; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
10274; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10275; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10276; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
10277; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
10278; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10279; GFX940-TGSPLIT-NEXT:    s_endpgm
10280;
10281; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
10282; GFX11-WGP:       ; %bb.0: ; %entry
10283; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
10284; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10285; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10286; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10287; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
10288; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
10289; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10290; GFX11-WGP-NEXT:    s_endpgm
10291;
10292; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_load:
10293; GFX11-CU:       ; %bb.0: ; %entry
10294; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
10295; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10296; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10297; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10298; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
10299; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
10300; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10301; GFX11-CU-NEXT:    s_endpgm
10302;
10303; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
10304; GFX12-WGP:       ; %bb.0: ; %entry
10305; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
10306; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10307; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10308; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10309; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
10310; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
10311; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10312; GFX12-WGP-NEXT:    s_endpgm
10313;
10314; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load:
10315; GFX12-CU:       ; %bb.0: ; %entry
10316; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
10317; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
10318; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10319; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10320; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
10321; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
10322; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10323; GFX12-CU-NEXT:    s_endpgm
10324    ptr addrspace(1) %in, ptr addrspace(1) %out) {
10325entry:
  ; Operation under test: the seq_cst atomic load at wavefront-one-as scope.
10326  %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4
  ; Plain store of the loaded value to %out.
10327  store i32 %val, ptr addrspace(1) %out
10328  ret void
10329}
10330
; Test kernel: atomic unordered store of the i32 argument %in to global memory
; (addrspace 1) at syncscope "wavefront-one-as".
; Each expectation below is a pair of scalar loads (presumably fetching the two
; kernel arguments), a wait for them, register setup, and a single store
; followed by s_endpgm. The gfx940 expectations carry "sc0 sc1" on that store;
; the other targets show no modifiers on it.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
10331define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
10332; GFX6-LABEL: global_wavefront_one_as_unordered_store:
10333; GFX6:       ; %bb.0: ; %entry
10334; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
10335; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
10336; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
10337; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10338; GFX6-NEXT:    s_mov_b32 s11, s5
10339; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10340; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
10341; GFX6-NEXT:    s_mov_b32 s10, -1
10342; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10343; GFX6-NEXT:    s_mov_b32 s5, s11
10344; GFX6-NEXT:    s_mov_b32 s6, s10
10345; GFX6-NEXT:    s_mov_b32 s7, s9
10346; GFX6-NEXT:    v_mov_b32_e32 v0, s8
10347; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
10348; GFX6-NEXT:    s_endpgm
10349;
10350; GFX7-LABEL: global_wavefront_one_as_unordered_store:
10351; GFX7:       ; %bb.0: ; %entry
10352; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10353; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10355; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10356; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10357; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10358; GFX7-NEXT:    flat_store_dword v[0:1], v2
10359; GFX7-NEXT:    s_endpgm
10360;
10361; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store:
10362; GFX10-WGP:       ; %bb.0: ; %entry
10363; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
10364; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10365; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10366; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10367; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
10368; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
10369; GFX10-WGP-NEXT:    s_endpgm
10370;
10371; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store:
10372; GFX10-CU:       ; %bb.0: ; %entry
10373; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
10374; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10375; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10376; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10377; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
10378; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
10379; GFX10-CU-NEXT:    s_endpgm
10380;
10381; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_store:
10382; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10383; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
10384; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
10385; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10386; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10387; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
10388; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10389; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
10390; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
10391; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
10392; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
10393; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
10394; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
10395; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10396; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10397; SKIP-CACHE-INV-NEXT:    s_endpgm
10398;
10399; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
10400; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10401; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10402; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10403; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10404; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10405; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10406; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10407; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10408;
10409; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
10410; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10411; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10412; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10413; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10414; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10415; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10416; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10417; GFX90A-TGSPLIT-NEXT:    s_endpgm
10418;
10419; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
10420; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10421; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10422; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10423; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10424; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10425; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10426; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10427; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10428;
10429; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
10430; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10431; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10432; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10433; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10434; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10435; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10436; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10437; GFX940-TGSPLIT-NEXT:    s_endpgm
10438;
10439; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_store:
10440; GFX11-WGP:       ; %bb.0: ; %entry
10441; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10442; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10443; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
10444; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10445; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
10446; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10447; GFX11-WGP-NEXT:    s_endpgm
10448;
10449; GFX11-CU-LABEL: global_wavefront_one_as_unordered_store:
10450; GFX11-CU:       ; %bb.0: ; %entry
10451; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10452; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10453; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
10454; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10455; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
10456; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10457; GFX11-CU-NEXT:    s_endpgm
10458;
10459; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store:
10460; GFX12-WGP:       ; %bb.0: ; %entry
10461; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10462; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10463; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
10464; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10465; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
10466; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10467; GFX12-WGP-NEXT:    s_endpgm
10468;
10469; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store:
10470; GFX12-CU:       ; %bb.0: ; %entry
10471; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10472; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10473; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
10474; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10475; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
10476; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10477; GFX12-CU-NEXT:    s_endpgm
10478    i32 %in, ptr addrspace(1) %out) {
10479entry:
  ; Operation under test: the unordered atomic store at wavefront-one-as scope.
10480  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
10481  ret void
10482}
10483
; Test kernel: atomic monotonic store of the i32 argument %in to global memory
; (addrspace 1) at syncscope "wavefront-one-as".
; Each expectation below is a pair of scalar loads (presumably fetching the two
; kernel arguments), a wait for them, register setup, and a single store
; followed by s_endpgm. The gfx940 expectations carry "sc0 sc1" on that store;
; the other targets show no modifiers on it.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
10484define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
10485; GFX6-LABEL: global_wavefront_one_as_monotonic_store:
10486; GFX6:       ; %bb.0: ; %entry
10487; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
10488; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
10489; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
10490; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10491; GFX6-NEXT:    s_mov_b32 s11, s5
10492; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10493; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
10494; GFX6-NEXT:    s_mov_b32 s10, -1
10495; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10496; GFX6-NEXT:    s_mov_b32 s5, s11
10497; GFX6-NEXT:    s_mov_b32 s6, s10
10498; GFX6-NEXT:    s_mov_b32 s7, s9
10499; GFX6-NEXT:    v_mov_b32_e32 v0, s8
10500; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
10501; GFX6-NEXT:    s_endpgm
10502;
10503; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
10504; GFX7:       ; %bb.0: ; %entry
10505; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10506; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10507; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10508; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10509; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10510; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10511; GFX7-NEXT:    flat_store_dword v[0:1], v2
10512; GFX7-NEXT:    s_endpgm
10513;
10514; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store:
10515; GFX10-WGP:       ; %bb.0: ; %entry
10516; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
10517; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10518; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10519; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10520; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
10521; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
10522; GFX10-WGP-NEXT:    s_endpgm
10523;
10524; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store:
10525; GFX10-CU:       ; %bb.0: ; %entry
10526; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
10527; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10528; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10529; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10530; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
10531; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
10532; GFX10-CU-NEXT:    s_endpgm
10533;
10534; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_store:
10535; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10536; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
10537; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
10538; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10539; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10540; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
10541; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10542; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
10543; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
10544; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
10545; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
10546; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
10547; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
10548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10549; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10550; SKIP-CACHE-INV-NEXT:    s_endpgm
10551;
10552; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
10553; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10554; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10555; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10556; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10557; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10558; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10559; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10560; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10561;
10562; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
10563; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10564; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10565; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10566; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10567; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10568; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10569; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10570; GFX90A-TGSPLIT-NEXT:    s_endpgm
10571;
10572; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
10573; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10574; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10575; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10576; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10577; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10578; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10579; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10580; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10581;
10582; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
10583; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10584; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10585; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10586; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10587; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10588; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10589; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10590; GFX940-TGSPLIT-NEXT:    s_endpgm
10591;
10592; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_store:
10593; GFX11-WGP:       ; %bb.0: ; %entry
10594; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10595; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10596; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
10597; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10598; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
10599; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10600; GFX11-WGP-NEXT:    s_endpgm
10601;
10602; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_store:
10603; GFX11-CU:       ; %bb.0: ; %entry
10604; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10605; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10606; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
10607; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10608; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
10609; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10610; GFX11-CU-NEXT:    s_endpgm
10611;
10612; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store:
10613; GFX12-WGP:       ; %bb.0: ; %entry
10614; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10615; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10616; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
10617; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10618; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
10619; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10620; GFX12-WGP-NEXT:    s_endpgm
10621;
10622; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store:
10623; GFX12-CU:       ; %bb.0: ; %entry
10624; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10625; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10626; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
10627; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10628; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
10629; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10630; GFX12-CU-NEXT:    s_endpgm
10631    i32 %in, ptr addrspace(1) %out) {
10632entry:
  ; Operation under test: the monotonic atomic store at wavefront-one-as scope.
10633  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
10634  ret void
10635}
10636
; Test kernel: atomic release store of the i32 argument %in to global memory
; (addrspace 1) at syncscope "wavefront-one-as".
; Each expectation below is a pair of scalar loads (presumably fetching the two
; kernel arguments), a wait for them, register setup, and a single store
; followed by s_endpgm; no extra wait or cache instruction precedes the store.
; The gfx940 expectations carry "sc0 sc1" on that store; the other targets show
; no modifiers on it.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
10637define amdgpu_kernel void @global_wavefront_one_as_release_store(
10638; GFX6-LABEL: global_wavefront_one_as_release_store:
10639; GFX6:       ; %bb.0: ; %entry
10640; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
10641; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
10642; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
10643; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10644; GFX6-NEXT:    s_mov_b32 s11, s5
10645; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10646; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
10647; GFX6-NEXT:    s_mov_b32 s10, -1
10648; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10649; GFX6-NEXT:    s_mov_b32 s5, s11
10650; GFX6-NEXT:    s_mov_b32 s6, s10
10651; GFX6-NEXT:    s_mov_b32 s7, s9
10652; GFX6-NEXT:    v_mov_b32_e32 v0, s8
10653; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
10654; GFX6-NEXT:    s_endpgm
10655;
10656; GFX7-LABEL: global_wavefront_one_as_release_store:
10657; GFX7:       ; %bb.0: ; %entry
10658; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10659; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10660; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10661; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10662; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10663; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10664; GFX7-NEXT:    flat_store_dword v[0:1], v2
10665; GFX7-NEXT:    s_endpgm
10666;
10667; GFX10-WGP-LABEL: global_wavefront_one_as_release_store:
10668; GFX10-WGP:       ; %bb.0: ; %entry
10669; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
10670; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10671; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10672; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10673; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
10674; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
10675; GFX10-WGP-NEXT:    s_endpgm
10676;
10677; GFX10-CU-LABEL: global_wavefront_one_as_release_store:
10678; GFX10-CU:       ; %bb.0: ; %entry
10679; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
10680; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10681; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10682; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10683; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
10684; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
10685; GFX10-CU-NEXT:    s_endpgm
10686;
10687; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_store:
10688; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10689; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
10690; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
10691; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10692; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10693; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
10694; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10695; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
10696; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
10697; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
10698; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
10699; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
10700; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
10701; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10702; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10703; SKIP-CACHE-INV-NEXT:    s_endpgm
10704;
10705; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store:
10706; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10707; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10708; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10709; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10710; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10711; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10712; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10713; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10714;
10715; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store:
10716; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10717; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10718; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10719; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10720; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10721; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10722; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10723; GFX90A-TGSPLIT-NEXT:    s_endpgm
10724;
10725; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store:
10726; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10727; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10728; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10729; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10730; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10731; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10732; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10733; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10734;
10735; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_store:
10736; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10737; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10738; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10739; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10740; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10741; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10742; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
10743; GFX940-TGSPLIT-NEXT:    s_endpgm
10744;
10745; GFX11-WGP-LABEL: global_wavefront_one_as_release_store:
10746; GFX11-WGP:       ; %bb.0: ; %entry
10747; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10748; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10749; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
10750; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10751; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
10752; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10753; GFX11-WGP-NEXT:    s_endpgm
10754;
10755; GFX11-CU-LABEL: global_wavefront_one_as_release_store:
10756; GFX11-CU:       ; %bb.0: ; %entry
10757; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10758; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10759; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
10760; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10761; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
10762; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10763; GFX11-CU-NEXT:    s_endpgm
10764;
10765; GFX12-WGP-LABEL: global_wavefront_one_as_release_store:
10766; GFX12-WGP:       ; %bb.0: ; %entry
10767; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10768; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10769; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
10770; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10771; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
10772; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10773; GFX12-WGP-NEXT:    s_endpgm
10774;
10775; GFX12-CU-LABEL: global_wavefront_one_as_release_store:
10776; GFX12-CU:       ; %bb.0: ; %entry
10777; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10778; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10779; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
10780; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10781; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
10782; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10783; GFX12-CU-NEXT:    s_endpgm
10784    i32 %in, ptr addrspace(1) %out) {
10785entry:
  ; Operation under test: the release atomic store at wavefront-one-as scope.
10786  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
10787  ret void
10788}
10789
; Atomic store of %in to the global (addrspace(1)) pointer %out with
; syncscope("wavefront-one-as") and seq_cst ordering.
;
; Every prefix below expects only the scalar argument loads, the wait on those
; loads, register moves and a single plain store before s_endpgm; no cache
; write-back/invalidate and no additional wait is checked around the store.
; The gfx940 expectations check the store without sc0/sc1 cache-scope
; modifiers, like the other wavefront-one-as atomics in this file: those
; modifiers encode a wider scope than wavefront.
10790define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
10791; GFX6-LABEL: global_wavefront_one_as_seq_cst_store:
10792; GFX6:       ; %bb.0: ; %entry
10793; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
10794; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
10795; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
10796; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10797; GFX6-NEXT:    s_mov_b32 s11, s5
10798; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10799; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
10800; GFX6-NEXT:    s_mov_b32 s10, -1
10801; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10802; GFX6-NEXT:    s_mov_b32 s5, s11
10803; GFX6-NEXT:    s_mov_b32 s6, s10
10804; GFX6-NEXT:    s_mov_b32 s7, s9
10805; GFX6-NEXT:    v_mov_b32_e32 v0, s8
10806; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
10807; GFX6-NEXT:    s_endpgm
10808;
10809; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
10810; GFX7:       ; %bb.0: ; %entry
10811; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
10812; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
10813; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10814; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10815; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10816; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10817; GFX7-NEXT:    flat_store_dword v[0:1], v2
10818; GFX7-NEXT:    s_endpgm
10819;
10820; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
10821; GFX10-WGP:       ; %bb.0: ; %entry
10822; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
10823; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10824; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10825; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10826; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
10827; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
10828; GFX10-WGP-NEXT:    s_endpgm
10829;
10830; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store:
10831; GFX10-CU:       ; %bb.0: ; %entry
10832; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
10833; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10834; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10835; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10836; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
10837; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
10838; GFX10-CU-NEXT:    s_endpgm
10839;
10840; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_store:
10841; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10842; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
10843; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
10844; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
10845; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10846; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
10847; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10848; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
10849; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
10850; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
10851; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
10852; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
10853; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
10854; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
10855; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10856; SKIP-CACHE-INV-NEXT:    s_endpgm
10857;
10858; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
10859; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
10860; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10861; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10862; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10863; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10864; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10865; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10866; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
10867;
10868; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
10869; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
10870; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
10871; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
10872; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10873; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10874; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
10875; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
10876; GFX90A-TGSPLIT-NEXT:    s_endpgm
10877;
10878; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
10879; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
10880; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10881; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10882; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10883; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10884; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10885; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
10886; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
10887;
10888; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
10889; GFX940-TGSPLIT:       ; %bb.0: ; %entry
10890; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
10891; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
10892; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
10893; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
10894; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
10895; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
10896; GFX940-TGSPLIT-NEXT:    s_endpgm
10897;
10898; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
10899; GFX11-WGP:       ; %bb.0: ; %entry
10900; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10901; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10902; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
10903; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10904; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
10905; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10906; GFX11-WGP-NEXT:    s_endpgm
10907;
10908; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_store:
10909; GFX11-CU:       ; %bb.0: ; %entry
10910; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10911; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10912; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
10913; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
10914; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
10915; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10916; GFX11-CU-NEXT:    s_endpgm
10917;
10918; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
10919; GFX12-WGP:       ; %bb.0: ; %entry
10920; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
10921; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10922; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
10923; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
10924; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
10925; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
10926; GFX12-WGP-NEXT:    s_endpgm
10927;
10928; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store:
10929; GFX12-CU:       ; %bb.0: ; %entry
10930; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
10931; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
10932; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
10933; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
10934; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
10935; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
10936; GFX12-CU-NEXT:    s_endpgm
10937    i32 %in, ptr addrspace(1) %out) {
10938entry:
10939  store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
10940  ret void
10941}
10942
; Volatile atomicrmw xchg of %in into the global (addrspace(1)) pointer %out
; with syncscope("wavefront-one-as") and monotonic ordering.
;
; Every prefix below expects only the two scalar argument loads, the wait on
; those loads, register moves and one swap atomic, directly followed by
; s_endpgm. The swap is checked without any cache-control modifier.
10943define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
10944; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
10945; GFX6:       ; %bb.0: ; %entry
10946; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10947; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
10948; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10949; GFX6-NEXT:    s_mov_b32 s11, s5
10950; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
10951; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
10952; GFX6-NEXT:    s_mov_b32 s10, -1
10953; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
10954; GFX6-NEXT:    s_mov_b32 s5, s11
10955; GFX6-NEXT:    s_mov_b32 s6, s10
10956; GFX6-NEXT:    s_mov_b32 s7, s9
10957; GFX6-NEXT:    v_mov_b32_e32 v0, s8
10958; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
10959; GFX6-NEXT:    s_endpgm
10960;
10961; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
10962; GFX7:       ; %bb.0: ; %entry
10963; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
10964; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
10965; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
10966; GFX7-NEXT:    v_mov_b32_e32 v0, s6
10967; GFX7-NEXT:    v_mov_b32_e32 v1, s7
10968; GFX7-NEXT:    v_mov_b32_e32 v2, s4
10969; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
10970; GFX7-NEXT:    s_endpgm
10971;
10972; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
10973; GFX10-WGP:       ; %bb.0: ; %entry
10974; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
10975; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10976; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
10977; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
10978; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
10979; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
10980; GFX10-WGP-NEXT:    s_endpgm
10981;
10982; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
10983; GFX10-CU:       ; %bb.0: ; %entry
10984; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
10985; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
10986; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
10987; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
10988; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
10989; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
10990; GFX10-CU-NEXT:    s_endpgm
10991;
10992; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
10993; SKIP-CACHE-INV:       ; %bb.0: ; %entry
10994; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
10995; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
10996; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
10997; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
10998; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
10999; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11000; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11001; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11002; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11003; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11004; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11005; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11006; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
11007; SKIP-CACHE-INV-NEXT:    s_endpgm
11008;
11009; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11010; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11011; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11012; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11013; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11014; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11015; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11016; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11017; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11018;
11019; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11020; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11021; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11022; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11023; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11024; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11025; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11026; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11027; GFX90A-TGSPLIT-NEXT:    s_endpgm
11028;
11029; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11030; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11031; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11032; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11033; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11034; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11035; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11036; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11037; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11038;
11039; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11040; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11041; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11042; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11043; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11044; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11045; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11046; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11047; GFX940-TGSPLIT-NEXT:    s_endpgm
11048;
11049; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11050; GFX11-WGP:       ; %bb.0: ; %entry
11051; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
11052; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11053; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11054; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11055; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
11056; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11057; GFX11-WGP-NEXT:    s_endpgm
11058;
11059; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11060; GFX11-CU:       ; %bb.0: ; %entry
11061; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
11062; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11063; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11064; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11065; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
11066; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11067; GFX11-CU-NEXT:    s_endpgm
11068;
11069; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11070; GFX12-WGP:       ; %bb.0: ; %entry
11071; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
11072; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11073; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11074; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11075; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
11076; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11077; GFX12-WGP-NEXT:    s_endpgm
11078;
11079; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
11080; GFX12-CU:       ; %bb.0: ; %entry
11081; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
11082; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11083; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11084; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11085; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
11086; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11087; GFX12-CU-NEXT:    s_endpgm
11088    ptr addrspace(1) %out, i32 %in) {
11089entry:
  ; %val has no uses in this function; the checks above only look at the
  ; emitted instruction sequence.
11090  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
11091  ret void
11092}
11093
; Volatile atomicrmw xchg of %in into the global (addrspace(1)) pointer %out
; with syncscope("wavefront-one-as") and acquire ordering.
;
; In every prefix below the swap atomic is directly followed by s_endpgm: no
; wait or cache-invalidate instruction is checked after it, and the swap itself
; carries no cache-control modifier.
11094define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
11095; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11096; GFX6:       ; %bb.0: ; %entry
11097; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11098; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
11099; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11100; GFX6-NEXT:    s_mov_b32 s11, s5
11101; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
11102; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
11103; GFX6-NEXT:    s_mov_b32 s10, -1
11104; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
11105; GFX6-NEXT:    s_mov_b32 s5, s11
11106; GFX6-NEXT:    s_mov_b32 s6, s10
11107; GFX6-NEXT:    s_mov_b32 s7, s9
11108; GFX6-NEXT:    v_mov_b32_e32 v0, s8
11109; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
11110; GFX6-NEXT:    s_endpgm
11111;
11112; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11113; GFX7:       ; %bb.0: ; %entry
11114; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11115; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11116; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11117; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11118; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11119; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11120; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11121; GFX7-NEXT:    s_endpgm
11122;
11123; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11124; GFX10-WGP:       ; %bb.0: ; %entry
11125; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
11126; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11127; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11128; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11129; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
11130; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
11131; GFX10-WGP-NEXT:    s_endpgm
11132;
11133; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11134; GFX10-CU:       ; %bb.0: ; %entry
11135; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
11136; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11137; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11138; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11139; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
11140; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
11141; GFX10-CU-NEXT:    s_endpgm
11142;
11143; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11144; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11145; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11146; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
11147; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11148; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
11149; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
11150; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11151; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11152; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11153; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11154; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11155; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11156; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11157; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
11158; SKIP-CACHE-INV-NEXT:    s_endpgm
11159;
11160; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11161; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11162; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11163; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11164; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11165; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11166; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11167; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11168; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11169;
11170; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11171; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11172; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11173; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11174; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11175; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11176; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11177; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11178; GFX90A-TGSPLIT-NEXT:    s_endpgm
11179;
11180; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11181; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11182; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11183; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11184; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11185; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11186; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11187; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11188; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11189;
11190; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11191; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11192; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11193; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11194; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11195; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11196; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11197; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11198; GFX940-TGSPLIT-NEXT:    s_endpgm
11199;
11200; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11201; GFX11-WGP:       ; %bb.0: ; %entry
11202; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
11203; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11204; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11205; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11206; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
11207; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11208; GFX11-WGP-NEXT:    s_endpgm
11209;
11210; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11211; GFX11-CU:       ; %bb.0: ; %entry
11212; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
11213; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11214; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11215; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11216; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
11217; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11218; GFX11-CU-NEXT:    s_endpgm
11219;
11220; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11221; GFX12-WGP:       ; %bb.0: ; %entry
11222; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
11223; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11224; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11225; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11226; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
11227; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11228; GFX12-WGP-NEXT:    s_endpgm
11229;
11230; GFX12-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
11231; GFX12-CU:       ; %bb.0: ; %entry
11232; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
11233; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11234; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11235; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11236; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
11237; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11238; GFX12-CU-NEXT:    s_endpgm
11239    ptr addrspace(1) %out, i32 %in) {
11240entry:
  ; %val has no uses in this function; the checks above only look at the
  ; emitted instruction sequence.
11241  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
11242  ret void
11243}
11244
; Volatile atomicrmw xchg of %in into the global (addrspace(1)) pointer %out
; with syncscope("wavefront-one-as") and release ordering.
;
; In every prefix below only register moves sit between the wait on the scalar
; argument loads and the swap atomic: no cache write-back or additional wait is
; checked before the swap, and the swap itself carries no cache-control
; modifier.
11245define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
11246; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw:
11247; GFX6:       ; %bb.0: ; %entry
11248; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11249; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
11250; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11251; GFX6-NEXT:    s_mov_b32 s11, s5
11252; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
11253; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
11254; GFX6-NEXT:    s_mov_b32 s10, -1
11255; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
11256; GFX6-NEXT:    s_mov_b32 s5, s11
11257; GFX6-NEXT:    s_mov_b32 s6, s10
11258; GFX6-NEXT:    s_mov_b32 s7, s9
11259; GFX6-NEXT:    v_mov_b32_e32 v0, s8
11260; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
11261; GFX6-NEXT:    s_endpgm
11262;
11263; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
11264; GFX7:       ; %bb.0: ; %entry
11265; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11266; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11267; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11268; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11269; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11270; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11271; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11272; GFX7-NEXT:    s_endpgm
11273;
11274; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
11275; GFX10-WGP:       ; %bb.0: ; %entry
11276; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
11277; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11278; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11279; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11280; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
11281; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
11282; GFX10-WGP-NEXT:    s_endpgm
11283;
11284; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
11285; GFX10-CU:       ; %bb.0: ; %entry
11286; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
11287; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11288; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11289; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11290; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
11291; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
11292; GFX10-CU-NEXT:    s_endpgm
11293;
11294; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw:
11295; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11296; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11297; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
11298; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11299; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
11300; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
11301; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11302; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11303; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11304; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11305; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11306; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11308; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
11309; SKIP-CACHE-INV-NEXT:    s_endpgm
11310;
11311; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
11312; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11313; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11314; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11315; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11316; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11317; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11318; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11319; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11320;
11321; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
11322; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11323; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11324; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11325; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11326; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11327; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11328; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11329; GFX90A-TGSPLIT-NEXT:    s_endpgm
11330;
11331; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
11332; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11333; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11334; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11335; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11336; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11337; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11338; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11339; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11340;
11341; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
11342; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11343; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11344; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11345; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11346; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11347; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11348; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11349; GFX940-TGSPLIT-NEXT:    s_endpgm
11350;
11351; GFX11-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
11352; GFX11-WGP:       ; %bb.0: ; %entry
11353; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
11354; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11355; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11356; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11357; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
11358; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11359; GFX11-WGP-NEXT:    s_endpgm
11360;
11361; GFX11-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
11362; GFX11-CU:       ; %bb.0: ; %entry
11363; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
11364; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11365; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11366; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11367; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
11368; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11369; GFX11-CU-NEXT:    s_endpgm
11370;
11371; GFX12-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
11372; GFX12-WGP:       ; %bb.0: ; %entry
11373; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
11374; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11375; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11376; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11377; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
11378; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11379; GFX12-WGP-NEXT:    s_endpgm
11380;
11381; GFX12-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
11382; GFX12-CU:       ; %bb.0: ; %entry
11383; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
11384; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11385; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11386; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11387; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
11388; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11389; GFX12-CU-NEXT:    s_endpgm
11390    ptr addrspace(1) %out, i32 %in) {
11391entry:
  ; %val has no uses in this function; the checks above only look at the
  ; emitted instruction sequence.
11392  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
11393  ret void
11394}
11395
11396define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
11397; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11398; GFX6:       ; %bb.0: ; %entry
11399; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11400; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
11401; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11402; GFX6-NEXT:    s_mov_b32 s11, s5
11403; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
11404; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
11405; GFX6-NEXT:    s_mov_b32 s10, -1
11406; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
11407; GFX6-NEXT:    s_mov_b32 s5, s11
11408; GFX6-NEXT:    s_mov_b32 s6, s10
11409; GFX6-NEXT:    s_mov_b32 s7, s9
11410; GFX6-NEXT:    v_mov_b32_e32 v0, s8
11411; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
11412; GFX6-NEXT:    s_endpgm
11413;
11414; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11415; GFX7:       ; %bb.0: ; %entry
11416; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11417; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11418; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11419; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11420; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11421; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11422; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11423; GFX7-NEXT:    s_endpgm
11424;
11425; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11426; GFX10-WGP:       ; %bb.0: ; %entry
11427; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
11428; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11429; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11430; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11431; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
11432; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
11433; GFX10-WGP-NEXT:    s_endpgm
11434;
11435; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11436; GFX10-CU:       ; %bb.0: ; %entry
11437; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
11438; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11439; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11440; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11441; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
11442; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
11443; GFX10-CU-NEXT:    s_endpgm
11444;
11445; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11446; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11447; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11448; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
11449; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11450; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
11451; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
11452; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11453; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11454; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11455; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11456; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11457; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11458; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11459; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
11460; SKIP-CACHE-INV-NEXT:    s_endpgm
11461;
11462; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11463; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11464; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11465; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11466; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11467; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11468; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11469; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11470; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11471;
11472; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11473; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11474; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11475; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11476; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11477; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11478; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11479; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11480; GFX90A-TGSPLIT-NEXT:    s_endpgm
11481;
11482; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11483; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11484; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11485; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11486; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11487; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11488; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11489; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11490; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11491;
11492; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11493; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11494; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11495; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11496; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11497; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11498; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11499; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11500; GFX940-TGSPLIT-NEXT:    s_endpgm
11501;
11502; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11503; GFX11-WGP:       ; %bb.0: ; %entry
11504; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
11505; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11506; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11507; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11508; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
11509; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11510; GFX11-WGP-NEXT:    s_endpgm
11511;
11512; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11513; GFX11-CU:       ; %bb.0: ; %entry
11514; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
11515; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11516; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11517; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11518; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
11519; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11520; GFX11-CU-NEXT:    s_endpgm
11521;
11522; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11523; GFX12-WGP:       ; %bb.0: ; %entry
11524; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
11525; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11526; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11527; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11528; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
11529; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11530; GFX12-WGP-NEXT:    s_endpgm
11531;
11532; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
11533; GFX12-CU:       ; %bb.0: ; %entry
11534; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
11535; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11536; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11537; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11538; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
11539; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11540; GFX12-CU-NEXT:    s_endpgm
11541    ptr addrspace(1) %out, i32 %in) {
11542entry:
11543  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
11544  ret void
11545}
11546
; Kernel under test: seq_cst `atomicrmw volatile xchg` on a global
; (addrspace(1)) pointer at syncscope("wavefront-one-as"), result unused.
; For every check prefix in this block the expected code is the atomic swap
; without a return modifier, followed directly by s_endpgm.
11547define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
11548; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11549; GFX6:       ; %bb.0: ; %entry
11550; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11551; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
11552; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11553; GFX6-NEXT:    s_mov_b32 s11, s5
11554; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
11555; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
11556; GFX6-NEXT:    s_mov_b32 s10, -1
11557; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
11558; GFX6-NEXT:    s_mov_b32 s5, s11
11559; GFX6-NEXT:    s_mov_b32 s6, s10
11560; GFX6-NEXT:    s_mov_b32 s7, s9
11561; GFX6-NEXT:    v_mov_b32_e32 v0, s8
11562; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
11563; GFX6-NEXT:    s_endpgm
11564;
11565; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11566; GFX7:       ; %bb.0: ; %entry
11567; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
11568; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
11569; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11570; GFX7-NEXT:    v_mov_b32_e32 v0, s6
11571; GFX7-NEXT:    v_mov_b32_e32 v1, s7
11572; GFX7-NEXT:    v_mov_b32_e32 v2, s4
11573; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
11574; GFX7-NEXT:    s_endpgm
11575;
11576; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11577; GFX10-WGP:       ; %bb.0: ; %entry
11578; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
11579; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11580; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11581; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11582; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
11583; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
11584; GFX10-WGP-NEXT:    s_endpgm
11585;
11586; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11587; GFX10-CU:       ; %bb.0: ; %entry
11588; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
11589; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11590; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11591; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11592; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
11593; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
11594; GFX10-CU-NEXT:    s_endpgm
11595;
11596; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11597; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11598; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11599; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
11600; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11601; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
11602; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
11603; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11604; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11605; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11606; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11607; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11608; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11609; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11610; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
11611; SKIP-CACHE-INV-NEXT:    s_endpgm
11612;
11613; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11614; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11615; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11616; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11617; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11618; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11619; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11620; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11621; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11622;
11623; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11624; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11625; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11626; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11627; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11628; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11629; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11630; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
11631; GFX90A-TGSPLIT-NEXT:    s_endpgm
11632;
11633; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11634; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11635; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11636; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11637; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11638; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11639; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11640; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11641; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11642;
11643; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11644; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11645; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11646; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11647; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11648; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11649; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11650; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
11651; GFX940-TGSPLIT-NEXT:    s_endpgm
11652;
11653; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11654; GFX11-WGP:       ; %bb.0: ; %entry
11655; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
11656; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11657; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11658; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11659; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
11660; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11661; GFX11-WGP-NEXT:    s_endpgm
11662;
11663; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11664; GFX11-CU:       ; %bb.0: ; %entry
11665; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
11666; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11667; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11668; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11669; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
11670; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11671; GFX11-CU-NEXT:    s_endpgm
11672;
11673; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11674; GFX12-WGP:       ; %bb.0: ; %entry
11675; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
11676; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11677; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11678; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11679; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
11680; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11681; GFX12-WGP-NEXT:    s_endpgm
11682;
11683; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
11684; GFX12-CU:       ; %bb.0: ; %entry
11685; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
11686; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11687; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11688; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11689; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
11690; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
11691; GFX12-CU-NEXT:    s_endpgm
11692    ptr addrspace(1) %out, i32 %in) {
11693entry:
  ; %val has no users in this function.
11694  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
11695  ret void
11696}
11697
; Kernel under test: acquire `atomicrmw volatile xchg` on a global
; (addrspace(1)) pointer at syncscope("wavefront-one-as"); the value produced
; by the exchange is stored back through %out.
; The expected code uses the value-returning form of the swap (marked glc,
; sc0 or th:TH_ATOMIC_RETURN depending on the run), then s_waitcnt vmcnt(0)
; (s_wait_loadcnt 0x0 on the GFX12 runs) before the store and s_endpgm.
11698define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
11699; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11700; GFX6:       ; %bb.0: ; %entry
11701; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11702; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
11703; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11704; GFX6-NEXT:    s_mov_b32 s11, s5
11705; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
11706; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
11707; GFX6-NEXT:    s_mov_b32 s10, -1
11708; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
11709; GFX6-NEXT:    s_mov_b32 s5, s11
11710; GFX6-NEXT:    s_mov_b32 s6, s10
11711; GFX6-NEXT:    s_mov_b32 s7, s9
11712; GFX6-NEXT:    v_mov_b32_e32 v0, s8
11713; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
11714; GFX6-NEXT:    s_waitcnt vmcnt(0)
11715; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
11716; GFX6-NEXT:    s_endpgm
11717;
11718; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11719; GFX7:       ; %bb.0: ; %entry
11720; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11721; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
11722; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11723; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11724; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11725; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11726; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11727; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11728; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11729; GFX7-NEXT:    s_waitcnt vmcnt(0)
11730; GFX7-NEXT:    flat_store_dword v[0:1], v2
11731; GFX7-NEXT:    s_endpgm
11732;
11733; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11734; GFX10-WGP:       ; %bb.0: ; %entry
11735; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
11736; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11737; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11738; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11739; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
11740; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11741; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11742; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
11743; GFX10-WGP-NEXT:    s_endpgm
11744;
11745; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11746; GFX10-CU:       ; %bb.0: ; %entry
11747; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
11748; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11749; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11750; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11751; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
11752; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11753; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
11754; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
11755; GFX10-CU-NEXT:    s_endpgm
11756;
11757; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11758; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11759; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11760; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
11761; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11762; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
11763; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
11764; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11765; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11766; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11767; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11768; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11769; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11770; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11771; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
11772; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
11773; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
11774; SKIP-CACHE-INV-NEXT:    s_endpgm
11775;
11776; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11777; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11778; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11779; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11780; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11781; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11782; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11783; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11784; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11785; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
11786; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11787;
11788; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11789; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11790; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11791; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11792; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11793; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11794; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11795; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11796; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11797; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
11798; GFX90A-TGSPLIT-NEXT:    s_endpgm
11799;
11800; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11801; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11802; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11803; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11804; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11805; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11806; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11807; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
11808; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11809; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
11810; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11811;
11812; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11813; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11814; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11815; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11816; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11817; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11819; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
11820; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11821; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
11822; GFX940-TGSPLIT-NEXT:    s_endpgm
11823;
11824; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11825; GFX11-WGP:       ; %bb.0: ; %entry
11826; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
11827; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11828; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11829; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11830; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
11831; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
11832; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
11833; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
11834; GFX11-WGP-NEXT:    s_endpgm
11835;
11836; GFX11-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11837; GFX11-CU:       ; %bb.0: ; %entry
11838; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
11839; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11840; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11841; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
11842; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
11843; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
11844; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
11845; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
11846; GFX11-CU-NEXT:    s_endpgm
11847;
11848; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11849; GFX12-WGP:       ; %bb.0: ; %entry
11850; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
11851; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11852; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
11853; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
11854; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
11855; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
11856; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
11857; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
11858; GFX12-WGP-NEXT:    s_endpgm
11859;
11860; GFX12-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
11861; GFX12-CU:       ; %bb.0: ; %entry
11862; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
11863; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
11864; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
11865; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
11866; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
11867; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
11868; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
11869; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
11870; GFX12-CU-NEXT:    s_endpgm
11871    ptr addrspace(1) %out, i32 %in) {
11872entry:
  ; %val is consumed by the store on the next line, which writes it to %out.
11873  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
11874  store i32 %val, ptr addrspace(1) %out, align 4
11875  ret void
11876}
11877
; Kernel under test: acq_rel `atomicrmw volatile xchg` on a global
; (addrspace(1)) pointer at syncscope("wavefront-one-as"); the value produced
; by the exchange is stored back through %out.
; The expected code uses the value-returning form of the swap (marked glc,
; sc0 or th:TH_ATOMIC_RETURN depending on the run), then s_waitcnt vmcnt(0)
; (s_wait_loadcnt 0x0 on the GFX12 runs) before the store and s_endpgm.
11878define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
11879; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11880; GFX6:       ; %bb.0: ; %entry
11881; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11882; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
11883; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11884; GFX6-NEXT:    s_mov_b32 s11, s5
11885; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
11886; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
11887; GFX6-NEXT:    s_mov_b32 s10, -1
11888; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
11889; GFX6-NEXT:    s_mov_b32 s5, s11
11890; GFX6-NEXT:    s_mov_b32 s6, s10
11891; GFX6-NEXT:    s_mov_b32 s7, s9
11892; GFX6-NEXT:    v_mov_b32_e32 v0, s8
11893; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
11894; GFX6-NEXT:    s_waitcnt vmcnt(0)
11895; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
11896; GFX6-NEXT:    s_endpgm
11897;
11898; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11899; GFX7:       ; %bb.0: ; %entry
11900; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11901; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
11902; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
11903; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11904; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11905; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11906; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
11907; GFX7-NEXT:    v_mov_b32_e32 v0, s4
11908; GFX7-NEXT:    v_mov_b32_e32 v1, s5
11909; GFX7-NEXT:    s_waitcnt vmcnt(0)
11910; GFX7-NEXT:    flat_store_dword v[0:1], v2
11911; GFX7-NEXT:    s_endpgm
11912;
11913; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11914; GFX10-WGP:       ; %bb.0: ; %entry
11915; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
11916; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11917; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
11918; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
11919; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
11920; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11921; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
11922; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
11923; GFX10-WGP-NEXT:    s_endpgm
11924;
11925; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11926; GFX10-CU:       ; %bb.0: ; %entry
11927; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
11928; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11929; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
11930; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
11931; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
11932; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11933; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
11934; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
11935; GFX10-CU-NEXT:    s_endpgm
11936;
11937; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11938; SKIP-CACHE-INV:       ; %bb.0: ; %entry
11939; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11940; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
11941; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
11942; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
11943; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
11944; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
11945; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
11946; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
11947; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
11948; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
11949; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
11950; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
11951; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
11952; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
11953; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
11954; SKIP-CACHE-INV-NEXT:    s_endpgm
11955;
11956; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11957; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
11958; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11959; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11960; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11961; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11962; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11963; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11964; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11965; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
11966; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
11967;
11968; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11969; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
11970; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11971; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
11972; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
11973; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11974; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
11975; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
11976; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11977; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
11978; GFX90A-TGSPLIT-NEXT:    s_endpgm
11979;
11980; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11981; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
11982; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11983; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11984; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11985; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11986; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11987; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
11988; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
11989; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
11990; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
11991;
11992; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
11993; GFX940-TGSPLIT:       ; %bb.0: ; %entry
11994; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
11995; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
11996; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
11997; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
11998; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
11999; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
12000; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12001; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
12002; GFX940-TGSPLIT-NEXT:    s_endpgm
12003;
12004; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
12005; GFX11-WGP:       ; %bb.0: ; %entry
12006; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
12007; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12008; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12009; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12010; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
12011; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
12012; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12013; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
12014; GFX11-WGP-NEXT:    s_endpgm
12015;
12016; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
12017; GFX11-CU:       ; %bb.0: ; %entry
12018; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
12019; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12020; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12021; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12022; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
12023; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
12024; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12025; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
12026; GFX11-CU-NEXT:    s_endpgm
12027;
12028; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
12029; GFX12-WGP:       ; %bb.0: ; %entry
12030; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
12031; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12032; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12033; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12034; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
12035; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
12036; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12037; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
12038; GFX12-WGP-NEXT:    s_endpgm
12039;
12040; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
12041; GFX12-CU:       ; %bb.0: ; %entry
12042; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
12043; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12044; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12045; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12046; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
12047; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
12048; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12049; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
12050; GFX12-CU-NEXT:    s_endpgm
12051    ptr addrspace(1) %out, i32 %in) {
12052entry:
  ; %val is consumed by the store on the next line, which writes it to %out.
12053  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
12054  store i32 %val, ptr addrspace(1) %out, align 4
12055  ret void
12056}
12057
12058define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
12059; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12060; GFX6:       ; %bb.0: ; %entry
12061; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12062; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
12063; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12064; GFX6-NEXT:    s_mov_b32 s11, s5
12065; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
12066; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
12067; GFX6-NEXT:    s_mov_b32 s10, -1
12068; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
12069; GFX6-NEXT:    s_mov_b32 s5, s11
12070; GFX6-NEXT:    s_mov_b32 s6, s10
12071; GFX6-NEXT:    s_mov_b32 s7, s9
12072; GFX6-NEXT:    v_mov_b32_e32 v0, s8
12073; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
12074; GFX6-NEXT:    s_waitcnt vmcnt(0)
12075; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
12076; GFX6-NEXT:    s_endpgm
12077;
12078; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12079; GFX7:       ; %bb.0: ; %entry
12080; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12081; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
12082; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12083; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12084; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12085; GFX7-NEXT:    v_mov_b32_e32 v2, s6
12086; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
12087; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12088; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12089; GFX7-NEXT:    s_waitcnt vmcnt(0)
12090; GFX7-NEXT:    flat_store_dword v[0:1], v2
12091; GFX7-NEXT:    s_endpgm
12092;
12093; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12094; GFX10-WGP:       ; %bb.0: ; %entry
12095; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
12096; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12097; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
12098; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12099; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
12100; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
12101; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
12102; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
12103; GFX10-WGP-NEXT:    s_endpgm
12104;
12105; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12106; GFX10-CU:       ; %bb.0: ; %entry
12107; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
12108; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12109; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
12110; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12111; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
12112; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
12113; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
12114; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
12115; GFX10-CU-NEXT:    s_endpgm
12116;
12117; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12118; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12119; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12120; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
12121; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12122; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
12123; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
12124; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
12125; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
12126; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
12127; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
12128; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
12129; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
12130; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
12131; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
12132; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
12133; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12134; SKIP-CACHE-INV-NEXT:    s_endpgm
12135;
12136; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12137; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12138; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12139; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12140; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12141; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12142; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12143; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
12144; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12145; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
12146; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12147;
12148; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12149; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12150; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12151; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12152; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
12153; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12154; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12155; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
12156; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12157; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
12158; GFX90A-TGSPLIT-NEXT:    s_endpgm
12159;
12160; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12161; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12162; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12163; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12164; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12165; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12166; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12167; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
12168; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12169; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
12170; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12171;
12172; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12173; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12174; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12175; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12176; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
12177; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12178; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12179; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
12180; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
12181; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
12182; GFX940-TGSPLIT-NEXT:    s_endpgm
12183;
12184; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12185; GFX11-WGP:       ; %bb.0: ; %entry
12186; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
12187; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12188; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12189; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12190; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
12191; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
12192; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
12193; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
12194; GFX11-WGP-NEXT:    s_endpgm
12195;
12196; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12197; GFX11-CU:       ; %bb.0: ; %entry
12198; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
12199; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12200; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12201; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12202; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
12203; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
12204; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
12205; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
12206; GFX11-CU-NEXT:    s_endpgm
12207;
12208; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12209; GFX12-WGP:       ; %bb.0: ; %entry
12210; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
12211; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12212; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
12213; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12214; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
12215; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
12216; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
12217; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
12218; GFX12-WGP-NEXT:    s_endpgm
12219;
12220; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
12221; GFX12-CU:       ; %bb.0: ; %entry
12222; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
12223; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12224; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
12225; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12226; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
12227; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
12228; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
12229; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
12230; GFX12-CU-NEXT:    s_endpgm
12231    ptr addrspace(1) %out, i32 %in) {
12232entry:
12233  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
12234  store i32 %val, ptr addrspace(1) %out, align 4
12235  ret void
12236}
12237
; Test kernel: volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront-one-as") and monotonic success / monotonic failure
; ordering. The target is the i32 at index 4 from %out (hence the 16-byte
; offset in the expected output), and the cmpxchg result %val is never used.
; The autogenerated check lines that follow give the expected -O0 output for
; each RUN configuration; in every one of them the cmpswap instruction is
; immediately followed by s_endpgm.
12238define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
12239; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12240; GFX6:       ; %bb.0: ; %entry
12241; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
12242; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
12243; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
12244; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
12245; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12246; GFX6-NEXT:    s_mov_b32 s12, s5
12247; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
12248; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
12249; GFX6-NEXT:    s_mov_b32 s11, -1
12250; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
12251; GFX6-NEXT:    s_mov_b32 s5, s12
12252; GFX6-NEXT:    s_mov_b32 s6, s11
12253; GFX6-NEXT:    s_mov_b32 s7, s10
12254; GFX6-NEXT:    v_mov_b32_e32 v0, s9
12255; GFX6-NEXT:    v_mov_b32_e32 v2, s8
12256; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12257; GFX6-NEXT:    v_mov_b32_e32 v1, v2
12258; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
12259; GFX6-NEXT:    s_endpgm
12260;
12261; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12262; GFX7:       ; %bb.0: ; %entry
12263; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12264; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12265; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12266; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12267; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12268; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12269; GFX7-NEXT:    s_mov_b32 s4, s8
12270; GFX7-NEXT:    s_mov_b32 s5, s9
12271; GFX7-NEXT:    s_mov_b32 s9, s10
12272; GFX7-NEXT:    s_mov_b32 s8, s11
12273; GFX7-NEXT:    s_add_u32 s4, s4, s9
12274; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12275; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12276; GFX7-NEXT:    s_mov_b32 s5, s8
12277; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12278; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12279; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12280; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12281; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12282; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12283; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12284; GFX7-NEXT:    s_endpgm
12285;
12286; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12287; GFX10-WGP:       ; %bb.0: ; %entry
12288; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
12289; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12290; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
12291; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
12292; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12293; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12294; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
12295; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12296; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
12297; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12298; GFX10-WGP-NEXT:    s_endpgm
12299;
12300; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12301; GFX10-CU:       ; %bb.0: ; %entry
12302; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
12303; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12304; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
12305; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
12306; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12307; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12308; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
12309; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12310; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
12311; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12312; GFX10-CU-NEXT:    s_endpgm
12313;
12314; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12315; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12316; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
12317; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
12318; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
12319; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
12320; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12321; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
12322; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
12323; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
12324; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
12325; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
12326; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
12327; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
12328; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
12329; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
12330; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
12331; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
12333; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
12334; SKIP-CACHE-INV-NEXT:    s_endpgm
12335;
12336; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12337; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12338; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12340; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12341; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12342; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12343; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12344; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12345; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12346; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12347; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12348; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12349;
12350; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12351; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12352; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12353; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12354; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12355; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12356; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12358; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12359; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12360; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12361; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12362; GFX90A-TGSPLIT-NEXT:    s_endpgm
12363;
12364; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12365; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12366; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12367; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12368; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12369; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12370; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12371; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12372; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12373; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12374; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12375; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
12376; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12377;
12378; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12379; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12380; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12381; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12382; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12383; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12384; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12385; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12386; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12387; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12388; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12389; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
12390; GFX940-TGSPLIT-NEXT:    s_endpgm
12391;
12392; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12393; GFX11-WGP:       ; %bb.0: ; %entry
12394; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
12395; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12396; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12397; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12398; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12399; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12400; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
12401; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12402; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
12403; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12404; GFX11-WGP-NEXT:    s_endpgm
12405;
12406; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12407; GFX11-CU:       ; %bb.0: ; %entry
12408; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
12409; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12410; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12411; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12412; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12413; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12414; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
12415; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12416; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
12417; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12418; GFX11-CU-NEXT:    s_endpgm
12419;
12420; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12421; GFX12-WGP:       ; %bb.0: ; %entry
12422; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
12423; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12424; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12425; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12426; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12427; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12428; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
12429; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12430; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
12431; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12432; GFX12-WGP-NEXT:    s_endpgm
12433;
12434; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
12435; GFX12-CU:       ; %bb.0: ; %entry
12436; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
12437; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12438; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12439; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12440; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12441; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12442; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
12443; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12444; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
12445; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12446; GFX12-CU-NEXT:    s_endpgm
12447    ptr addrspace(1) %out, i32 %in, i32 %old) {
12448entry:
  ; Address of the i32 at index 4 from %out, i.e. byte offset 16.
12449  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the value at %gep with %old and store %in on a match; the returned
  ; value is left unused.
12450  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
12451  ret void
12452}
12453
; Test kernel: volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront-one-as") and acquire success / monotonic failure
; ordering. The target is the i32 at index 4 from %out (hence the 16-byte
; offset in the expected output), and the cmpxchg result %val is never used.
; The autogenerated check lines that follow give the expected -O0 output for
; each RUN configuration; in every one of them the cmpswap instruction is
; immediately followed by s_endpgm, with no wait or cache-invalidate
; instruction in between.
12454define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
12455; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12456; GFX6:       ; %bb.0: ; %entry
12457; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
12458; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
12459; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
12460; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
12461; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12462; GFX6-NEXT:    s_mov_b32 s12, s5
12463; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
12464; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
12465; GFX6-NEXT:    s_mov_b32 s11, -1
12466; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
12467; GFX6-NEXT:    s_mov_b32 s5, s12
12468; GFX6-NEXT:    s_mov_b32 s6, s11
12469; GFX6-NEXT:    s_mov_b32 s7, s10
12470; GFX6-NEXT:    v_mov_b32_e32 v0, s9
12471; GFX6-NEXT:    v_mov_b32_e32 v2, s8
12472; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12473; GFX6-NEXT:    v_mov_b32_e32 v1, v2
12474; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
12475; GFX6-NEXT:    s_endpgm
12476;
12477; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12478; GFX7:       ; %bb.0: ; %entry
12479; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12480; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12481; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12482; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12483; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12484; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12485; GFX7-NEXT:    s_mov_b32 s4, s8
12486; GFX7-NEXT:    s_mov_b32 s5, s9
12487; GFX7-NEXT:    s_mov_b32 s9, s10
12488; GFX7-NEXT:    s_mov_b32 s8, s11
12489; GFX7-NEXT:    s_add_u32 s4, s4, s9
12490; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12491; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12492; GFX7-NEXT:    s_mov_b32 s5, s8
12493; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12494; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12495; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12496; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12497; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12498; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12499; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12500; GFX7-NEXT:    s_endpgm
12501;
12502; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12503; GFX10-WGP:       ; %bb.0: ; %entry
12504; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
12505; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12506; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
12507; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
12508; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12509; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12510; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
12511; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12512; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
12513; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12514; GFX10-WGP-NEXT:    s_endpgm
12515;
12516; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12517; GFX10-CU:       ; %bb.0: ; %entry
12518; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
12519; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12520; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
12521; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
12522; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12523; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12524; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
12525; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12526; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
12527; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12528; GFX10-CU-NEXT:    s_endpgm
12529;
12530; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12531; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12532; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
12533; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
12534; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
12535; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
12536; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12537; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
12538; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
12539; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
12540; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
12541; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
12542; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
12543; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
12544; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
12545; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
12546; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
12547; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
12549; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
12550; SKIP-CACHE-INV-NEXT:    s_endpgm
12551;
12552; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12553; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12554; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12555; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12556; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12557; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12558; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12559; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12560; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12561; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12562; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12563; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12564; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12565;
12566; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12567; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12568; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12569; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12570; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12571; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12572; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12573; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12574; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12575; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12576; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12577; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12578; GFX90A-TGSPLIT-NEXT:    s_endpgm
12579;
12580; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12581; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12582; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12583; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12584; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12585; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12586; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12587; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12588; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12589; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12590; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12591; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
12592; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12593;
12594; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12595; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12596; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12597; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12598; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12599; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12600; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12601; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12602; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12603; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12604; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12605; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
12606; GFX940-TGSPLIT-NEXT:    s_endpgm
12607;
12608; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12609; GFX11-WGP:       ; %bb.0: ; %entry
12610; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
12611; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12612; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12613; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12614; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12615; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12616; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
12617; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12618; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
12619; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12620; GFX11-WGP-NEXT:    s_endpgm
12621;
12622; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12623; GFX11-CU:       ; %bb.0: ; %entry
12624; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
12625; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12626; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12627; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12628; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12629; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12630; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
12631; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12632; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
12633; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12634; GFX11-CU-NEXT:    s_endpgm
12635;
12636; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12637; GFX12-WGP:       ; %bb.0: ; %entry
12638; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
12639; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12640; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12641; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12642; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12643; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12644; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
12645; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12646; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
12647; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12648; GFX12-WGP-NEXT:    s_endpgm
12649;
12650; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
12651; GFX12-CU:       ; %bb.0: ; %entry
12652; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
12653; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12654; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12655; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12656; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12657; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12658; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
12659; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12660; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
12661; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12662; GFX12-CU-NEXT:    s_endpgm
12663    ptr addrspace(1) %out, i32 %in, i32 %old) {
12664entry:
  ; Address of the i32 at index 4 from %out, i.e. byte offset 16.
12665  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the value at %gep with %old and store %in on a match; the returned
  ; value is left unused.
12666  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
12667  ret void
12668}
12669
12670define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
12671; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12672; GFX6:       ; %bb.0: ; %entry
12673; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
12674; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
12675; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
12676; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
12677; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12678; GFX6-NEXT:    s_mov_b32 s12, s5
12679; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
12680; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
12681; GFX6-NEXT:    s_mov_b32 s11, -1
12682; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
12683; GFX6-NEXT:    s_mov_b32 s5, s12
12684; GFX6-NEXT:    s_mov_b32 s6, s11
12685; GFX6-NEXT:    s_mov_b32 s7, s10
12686; GFX6-NEXT:    v_mov_b32_e32 v0, s9
12687; GFX6-NEXT:    v_mov_b32_e32 v2, s8
12688; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12689; GFX6-NEXT:    v_mov_b32_e32 v1, v2
12690; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
12691; GFX6-NEXT:    s_endpgm
12692;
12693; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12694; GFX7:       ; %bb.0: ; %entry
12695; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12696; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12697; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12698; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12699; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12700; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12701; GFX7-NEXT:    s_mov_b32 s4, s8
12702; GFX7-NEXT:    s_mov_b32 s5, s9
12703; GFX7-NEXT:    s_mov_b32 s9, s10
12704; GFX7-NEXT:    s_mov_b32 s8, s11
12705; GFX7-NEXT:    s_add_u32 s4, s4, s9
12706; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12707; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12708; GFX7-NEXT:    s_mov_b32 s5, s8
12709; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12710; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12711; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12712; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12713; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12714; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12715; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12716; GFX7-NEXT:    s_endpgm
12717;
12718; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12719; GFX10-WGP:       ; %bb.0: ; %entry
12720; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
12721; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12722; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
12723; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
12724; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12725; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12726; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
12727; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12728; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
12729; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12730; GFX10-WGP-NEXT:    s_endpgm
12731;
12732; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12733; GFX10-CU:       ; %bb.0: ; %entry
12734; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
12735; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12736; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
12737; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
12738; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12739; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12740; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
12741; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12742; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
12743; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12744; GFX10-CU-NEXT:    s_endpgm
12745;
12746; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12747; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12748; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
12749; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
12750; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
12751; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
12752; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12753; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
12754; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
12755; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
12756; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
12757; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
12758; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
12759; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
12760; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
12761; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
12762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
12763; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12764; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
12765; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
12766; SKIP-CACHE-INV-NEXT:    s_endpgm
12767;
12768; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12769; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12770; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12771; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12772; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12773; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12774; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12775; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12776; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12777; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12778; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12779; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12780; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12781;
12782; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12783; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
12784; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12785; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12786; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12787; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12788; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12789; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12790; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12791; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12792; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12793; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12794; GFX90A-TGSPLIT-NEXT:    s_endpgm
12795;
12796; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12797; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
12798; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12799; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12800; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12801; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12802; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12803; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12804; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12805; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12806; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12807; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
12808; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
12809;
12810; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12811; GFX940-TGSPLIT:       ; %bb.0: ; %entry
12812; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12813; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
12814; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
12815; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
12816; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12817; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
12818; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
12819; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12820; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12821; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
12822; GFX940-TGSPLIT-NEXT:    s_endpgm
12823;
12824; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12825; GFX11-WGP:       ; %bb.0: ; %entry
12826; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
12827; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12828; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12829; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12830; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12831; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
12832; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
12833; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12834; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
12835; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12836; GFX11-WGP-NEXT:    s_endpgm
12837;
12838; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12839; GFX11-CU:       ; %bb.0: ; %entry
12840; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
12841; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12842; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12843; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12844; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
12845; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
12846; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
12847; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12848; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
12849; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12850; GFX11-CU-NEXT:    s_endpgm
12851;
12852; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12853; GFX12-WGP:       ; %bb.0: ; %entry
12854; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
12855; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12856; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
12857; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
12858; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
12859; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
12860; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
12861; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12862; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
12863; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12864; GFX12-WGP-NEXT:    s_endpgm
12865;
12866; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
12867; GFX12-CU:       ; %bb.0: ; %entry
12868; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
12869; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
12870; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
12871; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
12872; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
12873; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
12874; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
12875; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12876; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
12877; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
12878; GFX12-CU-NEXT:    s_endpgm
12879    ptr addrspace(1) %out, i32 %in, i32 %old) {
12880entry:
12881  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
12882  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
12883  ret void
12884}
12885
; Kernel under test: a volatile 32-bit cmpxchg on a global (addrspace(1))
; pointer at syncscope "wavefront-one-as", with success ordering acq_rel and
; failure ordering monotonic. The target address is element 4 of %out viewed
; as i32; the expected instructions below carry this either as an immediate
; offset of 16 or, in the flat-addressed variant, as a 64-bit add of 16.
; The cmpxchg result is unused.
; For every run configuration the expected output ends with the cmpswap
; instruction immediately followed by s_endpgm.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with the update script instead of editing them by hand.
12886define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
12887; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12888; GFX6:       ; %bb.0: ; %entry
12889; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
12890; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
12891; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
12892; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
12893; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12894; GFX6-NEXT:    s_mov_b32 s12, s5
12895; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
12896; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
12897; GFX6-NEXT:    s_mov_b32 s11, -1
12898; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
12899; GFX6-NEXT:    s_mov_b32 s5, s12
12900; GFX6-NEXT:    s_mov_b32 s6, s11
12901; GFX6-NEXT:    s_mov_b32 s7, s10
12902; GFX6-NEXT:    v_mov_b32_e32 v0, s9
12903; GFX6-NEXT:    v_mov_b32_e32 v2, s8
12904; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12905; GFX6-NEXT:    v_mov_b32_e32 v1, v2
12906; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
12907; GFX6-NEXT:    s_endpgm
12908;
12909; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12910; GFX7:       ; %bb.0: ; %entry
12911; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
12912; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
12913; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
12914; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
12915; GFX7-NEXT:    s_mov_b64 s[10:11], 16
12916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12917; GFX7-NEXT:    s_mov_b32 s4, s8
12918; GFX7-NEXT:    s_mov_b32 s5, s9
12919; GFX7-NEXT:    s_mov_b32 s9, s10
12920; GFX7-NEXT:    s_mov_b32 s8, s11
12921; GFX7-NEXT:    s_add_u32 s4, s4, s9
12922; GFX7-NEXT:    s_addc_u32 s8, s5, s8
12923; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12924; GFX7-NEXT:    s_mov_b32 s5, s8
12925; GFX7-NEXT:    v_mov_b32_e32 v2, s7
12926; GFX7-NEXT:    v_mov_b32_e32 v0, s6
12927; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12928; GFX7-NEXT:    v_mov_b32_e32 v3, v0
12929; GFX7-NEXT:    v_mov_b32_e32 v0, s4
12930; GFX7-NEXT:    v_mov_b32_e32 v1, s5
12931; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
12932; GFX7-NEXT:    s_endpgm
12933;
12934; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12935; GFX10-WGP:       ; %bb.0: ; %entry
12936; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
12937; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12938; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
12939; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
12940; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
12941; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
12942; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
12943; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12944; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
12945; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12946; GFX10-WGP-NEXT:    s_endpgm
12947;
12948; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12949; GFX10-CU:       ; %bb.0: ; %entry
12950; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
12951; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12952; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
12953; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
12954; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
12955; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
12956; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
12957; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
12958; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
12959; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
12960; GFX10-CU-NEXT:    s_endpgm
12961;
12962; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12963; SKIP-CACHE-INV:       ; %bb.0: ; %entry
12964; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
12965; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
12966; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
12967; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
12968; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
12969; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
12970; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
12971; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
12972; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
12973; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
12974; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
12975; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
12976; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
12977; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
12978; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
12979; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
12980; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
12981; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
12982; SKIP-CACHE-INV-NEXT:    s_endpgm
12983;
12984; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12985; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
12986; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
12987; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
12988; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
12989; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
12990; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
12991; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
12992; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
12993; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12994; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
12995; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
12996; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
12997;
12998; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
12999; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13000; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13001; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13002; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13003; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13004; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13005; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13006; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13007; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13008; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13009; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13010; GFX90A-TGSPLIT-NEXT:    s_endpgm
13011;
13012; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13013; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13014; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13015; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13016; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13017; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13018; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13019; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13020; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13021; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13022; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13023; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13024; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13025;
13026; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13027; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13028; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13029; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13030; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13031; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13032; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13033; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13034; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13035; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13036; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13037; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13038; GFX940-TGSPLIT-NEXT:    s_endpgm
13039;
13040; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13041; GFX11-WGP:       ; %bb.0: ; %entry
13042; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
13043; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13044; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13045; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13046; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13047; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13048; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
13049; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13050; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
13051; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13052; GFX11-WGP-NEXT:    s_endpgm
13053;
13054; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13055; GFX11-CU:       ; %bb.0: ; %entry
13056; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
13057; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13058; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13059; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13060; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13061; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13062; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
13063; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13064; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
13065; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13066; GFX11-CU-NEXT:    s_endpgm
13067;
13068; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13069; GFX12-WGP:       ; %bb.0: ; %entry
13070; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
13071; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13072; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13073; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13074; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13075; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13076; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
13077; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13078; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
13079; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13080; GFX12-WGP-NEXT:    s_endpgm
13081;
13082; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
13083; GFX12-CU:       ; %bb.0: ; %entry
13084; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
13085; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13086; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13087; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13088; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13089; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13090; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
13091; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13092; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
13093; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13094; GFX12-CU-NEXT:    s_endpgm
13095    ptr addrspace(1) %out, i32 %in, i32 %old) {
13096entry:
  ; Address the i32 at index 4 from %out (16 bytes past the base pointer).
13097  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on a match; the result is discarded.
13098  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
13099  ret void
13100}
13101
; Kernel under test: a volatile 32-bit cmpxchg on a global (addrspace(1))
; pointer at syncscope "wavefront-one-as", with success ordering seq_cst and
; failure ordering monotonic. The target address is element 4 of %out viewed
; as i32; the expected instructions below carry this either as an immediate
; offset of 16 or, in the flat-addressed variant, as a 64-bit add of 16.
; The cmpxchg result is unused.
; For every run configuration the expected output ends with the cmpswap
; instruction immediately followed by s_endpgm.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with the update script instead of editing them by hand.
13102define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
13103; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13104; GFX6:       ; %bb.0: ; %entry
13105; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
13106; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
13107; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
13108; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
13109; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13110; GFX6-NEXT:    s_mov_b32 s12, s5
13111; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
13112; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
13113; GFX6-NEXT:    s_mov_b32 s11, -1
13114; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
13115; GFX6-NEXT:    s_mov_b32 s5, s12
13116; GFX6-NEXT:    s_mov_b32 s6, s11
13117; GFX6-NEXT:    s_mov_b32 s7, s10
13118; GFX6-NEXT:    v_mov_b32_e32 v0, s9
13119; GFX6-NEXT:    v_mov_b32_e32 v2, s8
13120; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13121; GFX6-NEXT:    v_mov_b32_e32 v1, v2
13122; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
13123; GFX6-NEXT:    s_endpgm
13124;
13125; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13126; GFX7:       ; %bb.0: ; %entry
13127; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13128; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13129; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13130; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13131; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13132; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13133; GFX7-NEXT:    s_mov_b32 s4, s8
13134; GFX7-NEXT:    s_mov_b32 s5, s9
13135; GFX7-NEXT:    s_mov_b32 s9, s10
13136; GFX7-NEXT:    s_mov_b32 s8, s11
13137; GFX7-NEXT:    s_add_u32 s4, s4, s9
13138; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13139; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13140; GFX7-NEXT:    s_mov_b32 s5, s8
13141; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13142; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13143; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13144; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13145; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13146; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13147; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13148; GFX7-NEXT:    s_endpgm
13149;
13150; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13151; GFX10-WGP:       ; %bb.0: ; %entry
13152; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
13153; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13154; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
13155; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
13156; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13157; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13158; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
13159; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13160; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
13161; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13162; GFX10-WGP-NEXT:    s_endpgm
13163;
13164; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13165; GFX10-CU:       ; %bb.0: ; %entry
13166; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
13167; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13168; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
13169; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
13170; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13171; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13172; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
13173; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13174; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
13175; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13176; GFX10-CU-NEXT:    s_endpgm
13177;
13178; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13179; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13180; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
13181; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
13182; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
13183; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
13184; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13185; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
13186; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
13187; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
13188; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
13189; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
13190; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
13191; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
13192; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
13193; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
13194; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
13195; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13196; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
13197; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
13198; SKIP-CACHE-INV-NEXT:    s_endpgm
13199;
13200; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13201; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13202; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13203; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13204; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13205; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13206; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13207; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13208; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13209; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13210; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13211; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13212; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13213;
13214; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13215; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13216; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13217; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13218; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13219; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13220; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13221; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13222; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13223; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13224; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13225; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13226; GFX90A-TGSPLIT-NEXT:    s_endpgm
13227;
13228; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13229; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13230; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13231; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13232; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13233; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13234; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13235; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13236; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13237; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13238; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13239; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13240; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13241;
13242; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13243; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13244; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13245; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13246; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13247; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13248; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13249; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13250; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13251; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13252; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13253; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13254; GFX940-TGSPLIT-NEXT:    s_endpgm
13255;
13256; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13257; GFX11-WGP:       ; %bb.0: ; %entry
13258; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
13259; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13260; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13261; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13262; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13263; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13264; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
13265; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13266; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
13267; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13268; GFX11-WGP-NEXT:    s_endpgm
13269;
13270; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13271; GFX11-CU:       ; %bb.0: ; %entry
13272; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
13273; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13274; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13275; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13276; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13277; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13278; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
13279; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13280; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
13281; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13282; GFX11-CU-NEXT:    s_endpgm
13283;
13284; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13285; GFX12-WGP:       ; %bb.0: ; %entry
13286; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
13287; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13288; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13289; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13290; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13291; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13292; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
13293; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13294; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
13295; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13296; GFX12-WGP-NEXT:    s_endpgm
13297;
13298; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
13299; GFX12-CU:       ; %bb.0: ; %entry
13300; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
13301; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13302; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13303; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13304; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13305; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13306; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
13307; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13308; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
13309; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13310; GFX12-CU-NEXT:    s_endpgm
13311    ptr addrspace(1) %out, i32 %in, i32 %old) {
13312entry:
  ; Address the i32 at index 4 from %out (16 bytes past the base pointer).
13313  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on a match; the result is discarded.
13314  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
13315  ret void
13316}
13317
; Volatile cmpxchg through an addrspace(1) pointer at syncscope
; "wavefront-one-as", with success ordering monotonic and failure ordering
; acquire. For every run line the expected output below is one cmpswap on
; %out + 16 bytes followed directly by s_endpgm; the only wait instruction
; expected is the one that follows the scalar loads at the top of the kernel.
13318define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
13319; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13320; GFX6:       ; %bb.0: ; %entry
13321; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
13322; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
13323; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
13324; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
13325; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13326; GFX6-NEXT:    s_mov_b32 s12, s5
13327; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
13328; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
13329; GFX6-NEXT:    s_mov_b32 s11, -1
13330; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
13331; GFX6-NEXT:    s_mov_b32 s5, s12
13332; GFX6-NEXT:    s_mov_b32 s6, s11
13333; GFX6-NEXT:    s_mov_b32 s7, s10
13334; GFX6-NEXT:    v_mov_b32_e32 v0, s9
13335; GFX6-NEXT:    v_mov_b32_e32 v2, s8
13336; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13337; GFX6-NEXT:    v_mov_b32_e32 v1, v2
13338; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
13339; GFX6-NEXT:    s_endpgm
13340;
13341; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13342; GFX7:       ; %bb.0: ; %entry
13343; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13344; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13345; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13346; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13347; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13348; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13349; GFX7-NEXT:    s_mov_b32 s4, s8
13350; GFX7-NEXT:    s_mov_b32 s5, s9
13351; GFX7-NEXT:    s_mov_b32 s9, s10
13352; GFX7-NEXT:    s_mov_b32 s8, s11
13353; GFX7-NEXT:    s_add_u32 s4, s4, s9
13354; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13355; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13356; GFX7-NEXT:    s_mov_b32 s5, s8
13357; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13358; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13359; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13360; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13361; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13362; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13363; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13364; GFX7-NEXT:    s_endpgm
13365;
13366; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13367; GFX10-WGP:       ; %bb.0: ; %entry
13368; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
13369; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13370; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
13371; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
13372; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13373; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13374; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
13375; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13376; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
13377; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13378; GFX10-WGP-NEXT:    s_endpgm
13379;
13380; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13381; GFX10-CU:       ; %bb.0: ; %entry
13382; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
13383; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13384; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
13385; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
13386; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13387; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13388; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
13389; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13390; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
13391; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13392; GFX10-CU-NEXT:    s_endpgm
13393;
13394; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13395; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13396; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
13397; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
13398; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
13399; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
13400; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13401; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
13402; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
13403; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
13404; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
13405; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
13406; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
13407; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
13408; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
13409; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
13410; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
13411; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13412; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
13413; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
13414; SKIP-CACHE-INV-NEXT:    s_endpgm
13415;
13416; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13417; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13418; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13419; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13420; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13421; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13422; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13423; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13424; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13425; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13426; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13427; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13428; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13429;
13430; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13431; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13432; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13433; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13434; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13435; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13436; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13437; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13438; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13439; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13440; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13441; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13442; GFX90A-TGSPLIT-NEXT:    s_endpgm
13443;
13444; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13445; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13446; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13447; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13448; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13449; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13450; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13451; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13452; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13453; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13454; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13455; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13456; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13457;
13458; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13459; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13460; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13461; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13462; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13463; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13464; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13465; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13466; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13467; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13468; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13469; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13470; GFX940-TGSPLIT-NEXT:    s_endpgm
13471;
13472; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13473; GFX11-WGP:       ; %bb.0: ; %entry
13474; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
13475; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13476; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13477; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13478; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13479; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13480; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
13481; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13482; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
13483; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13484; GFX11-WGP-NEXT:    s_endpgm
13485;
13486; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13487; GFX11-CU:       ; %bb.0: ; %entry
13488; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
13489; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13490; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13491; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13492; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13493; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13494; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
13495; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13496; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
13497; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13498; GFX11-CU-NEXT:    s_endpgm
13499;
13500; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13501; GFX12-WGP:       ; %bb.0: ; %entry
13502; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
13503; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13504; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13505; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13506; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13507; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13508; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
13509; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13510; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
13511; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13512; GFX12-WGP-NEXT:    s_endpgm
13513;
13514; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
13515; GFX12-CU:       ; %bb.0: ; %entry
13516; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
13517; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13518; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13519; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13520; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13521; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13522; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
13523; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13524; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
13525; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13526; GFX12-CU-NEXT:    s_endpgm
13527    ptr addrspace(1) %out, i32 %in, i32 %old) {
13528entry:
  ; %gep is element 4 of the i32 array at %out, i.e. 16 bytes past the base.
  ; cmpxchg operands are pointer, expected value (%old), replacement (%in);
  ; the result %val is never used.
13529  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
13530  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
13531  ret void
13532}
13533
; Volatile cmpxchg through an addrspace(1) pointer at syncscope
; "wavefront-one-as", with success ordering acquire and failure ordering
; acquire. For every run line the expected output below is one cmpswap on
; %out + 16 bytes followed directly by s_endpgm; the only wait instruction
; expected is the one that follows the scalar loads at the top of the kernel.
13534define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
13535; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13536; GFX6:       ; %bb.0: ; %entry
13537; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
13538; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
13539; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
13540; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
13541; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13542; GFX6-NEXT:    s_mov_b32 s12, s5
13543; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
13544; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
13545; GFX6-NEXT:    s_mov_b32 s11, -1
13546; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
13547; GFX6-NEXT:    s_mov_b32 s5, s12
13548; GFX6-NEXT:    s_mov_b32 s6, s11
13549; GFX6-NEXT:    s_mov_b32 s7, s10
13550; GFX6-NEXT:    v_mov_b32_e32 v0, s9
13551; GFX6-NEXT:    v_mov_b32_e32 v2, s8
13552; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13553; GFX6-NEXT:    v_mov_b32_e32 v1, v2
13554; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
13555; GFX6-NEXT:    s_endpgm
13556;
13557; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13558; GFX7:       ; %bb.0: ; %entry
13559; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13560; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13561; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13562; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13563; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13564; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13565; GFX7-NEXT:    s_mov_b32 s4, s8
13566; GFX7-NEXT:    s_mov_b32 s5, s9
13567; GFX7-NEXT:    s_mov_b32 s9, s10
13568; GFX7-NEXT:    s_mov_b32 s8, s11
13569; GFX7-NEXT:    s_add_u32 s4, s4, s9
13570; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13571; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13572; GFX7-NEXT:    s_mov_b32 s5, s8
13573; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13574; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13575; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13576; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13577; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13578; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13579; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13580; GFX7-NEXT:    s_endpgm
13581;
13582; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13583; GFX10-WGP:       ; %bb.0: ; %entry
13584; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
13585; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13586; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
13587; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
13588; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13589; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13590; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
13591; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13592; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
13593; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13594; GFX10-WGP-NEXT:    s_endpgm
13595;
13596; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13597; GFX10-CU:       ; %bb.0: ; %entry
13598; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
13599; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13600; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
13601; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
13602; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13603; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13604; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
13605; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13606; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
13607; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13608; GFX10-CU-NEXT:    s_endpgm
13609;
13610; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13611; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13612; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
13613; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
13614; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
13615; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
13616; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13617; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
13618; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
13619; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
13620; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
13621; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
13622; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
13623; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
13624; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
13625; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
13626; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
13627; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13628; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
13629; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
13630; SKIP-CACHE-INV-NEXT:    s_endpgm
13631;
13632; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13633; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13634; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13635; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13636; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13637; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13638; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13639; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13640; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13641; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13642; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13643; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13644; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13645;
13646; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13647; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13648; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13649; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13650; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13651; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13652; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13653; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13654; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13655; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13656; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13657; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13658; GFX90A-TGSPLIT-NEXT:    s_endpgm
13659;
13660; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13661; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13662; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13663; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13664; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13665; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13666; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13667; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13668; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13669; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13670; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13671; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13672; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13673;
13674; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13675; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13676; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13677; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13678; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13679; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13680; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13681; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13682; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13683; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13684; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13685; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13686; GFX940-TGSPLIT-NEXT:    s_endpgm
13687;
13688; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13689; GFX11-WGP:       ; %bb.0: ; %entry
13690; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
13691; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13692; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13693; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13694; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13695; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13696; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
13697; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13698; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
13699; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13700; GFX11-WGP-NEXT:    s_endpgm
13701;
13702; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13703; GFX11-CU:       ; %bb.0: ; %entry
13704; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
13705; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13706; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13707; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13708; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13709; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13710; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
13711; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13712; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
13713; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13714; GFX11-CU-NEXT:    s_endpgm
13715;
13716; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13717; GFX12-WGP:       ; %bb.0: ; %entry
13718; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
13719; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13720; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13721; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13722; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13723; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13724; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
13725; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13726; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
13727; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13728; GFX12-WGP-NEXT:    s_endpgm
13729;
13730; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
13731; GFX12-CU:       ; %bb.0: ; %entry
13732; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
13733; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13734; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13735; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13736; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13737; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13738; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
13739; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13740; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
13741; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13742; GFX12-CU-NEXT:    s_endpgm
13743    ptr addrspace(1) %out, i32 %in, i32 %old) {
13744entry:
  ; %gep is element 4 of the i32 array at %out, i.e. 16 bytes past the base.
  ; cmpxchg operands are pointer, expected value (%old), replacement (%in);
  ; the result %val is never used.
13745  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
13746  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
13747  ret void
13748}
13749
; Volatile cmpxchg through an addrspace(1) pointer at syncscope
; "wavefront-one-as", with success ordering release and failure ordering
; acquire. For every run line the expected output below is one cmpswap on
; %out + 16 bytes followed directly by s_endpgm; the only wait instruction
; expected is the one that follows the scalar loads at the top of the kernel.
13750define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
13751; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13752; GFX6:       ; %bb.0: ; %entry
13753; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
13754; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
13755; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
13756; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
13757; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13758; GFX6-NEXT:    s_mov_b32 s12, s5
13759; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
13760; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
13761; GFX6-NEXT:    s_mov_b32 s11, -1
13762; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
13763; GFX6-NEXT:    s_mov_b32 s5, s12
13764; GFX6-NEXT:    s_mov_b32 s6, s11
13765; GFX6-NEXT:    s_mov_b32 s7, s10
13766; GFX6-NEXT:    v_mov_b32_e32 v0, s9
13767; GFX6-NEXT:    v_mov_b32_e32 v2, s8
13768; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13769; GFX6-NEXT:    v_mov_b32_e32 v1, v2
13770; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
13771; GFX6-NEXT:    s_endpgm
13772;
13773; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13774; GFX7:       ; %bb.0: ; %entry
13775; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13776; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13777; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13778; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13779; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13780; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13781; GFX7-NEXT:    s_mov_b32 s4, s8
13782; GFX7-NEXT:    s_mov_b32 s5, s9
13783; GFX7-NEXT:    s_mov_b32 s9, s10
13784; GFX7-NEXT:    s_mov_b32 s8, s11
13785; GFX7-NEXT:    s_add_u32 s4, s4, s9
13786; GFX7-NEXT:    s_addc_u32 s8, s5, s8
13787; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13788; GFX7-NEXT:    s_mov_b32 s5, s8
13789; GFX7-NEXT:    v_mov_b32_e32 v2, s7
13790; GFX7-NEXT:    v_mov_b32_e32 v0, s6
13791; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13792; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13793; GFX7-NEXT:    v_mov_b32_e32 v0, s4
13794; GFX7-NEXT:    v_mov_b32_e32 v1, s5
13795; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
13796; GFX7-NEXT:    s_endpgm
13797;
13798; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13799; GFX10-WGP:       ; %bb.0: ; %entry
13800; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
13801; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13802; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
13803; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
13804; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13805; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
13806; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
13807; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13808; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
13809; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13810; GFX10-WGP-NEXT:    s_endpgm
13811;
13812; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13813; GFX10-CU:       ; %bb.0: ; %entry
13814; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
13815; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13816; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
13817; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
13818; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
13819; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
13820; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
13821; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13822; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
13823; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
13824; GFX10-CU-NEXT:    s_endpgm
13825;
13826; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13827; SKIP-CACHE-INV:       ; %bb.0: ; %entry
13828; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
13829; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
13830; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
13831; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
13832; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
13833; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
13834; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
13835; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
13836; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
13837; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
13838; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
13839; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
13840; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
13841; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
13842; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
13843; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
13845; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
13846; SKIP-CACHE-INV-NEXT:    s_endpgm
13847;
13848; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13849; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
13850; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13851; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13852; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13853; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13854; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13855; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13856; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13857; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13858; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13859; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13860; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
13861;
13862; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13863; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
13864; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13865; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
13866; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
13867; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
13868; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13869; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
13870; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
13871; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13872; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13873; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
13874; GFX90A-TGSPLIT-NEXT:    s_endpgm
13875;
13876; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13877; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
13878; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13879; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13880; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13881; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13882; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13883; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13884; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13885; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13886; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13887; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13888; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
13889;
13890; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13891; GFX940-TGSPLIT:       ; %bb.0: ; %entry
13892; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
13893; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
13894; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
13895; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
13896; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
13897; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
13898; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
13899; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13900; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
13901; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
13902; GFX940-TGSPLIT-NEXT:    s_endpgm
13903;
13904; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13905; GFX11-WGP:       ; %bb.0: ; %entry
13906; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
13907; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13908; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13909; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13910; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
13911; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
13912; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
13913; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13914; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
13915; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13916; GFX11-WGP-NEXT:    s_endpgm
13917;
13918; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13919; GFX11-CU:       ; %bb.0: ; %entry
13920; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
13921; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13922; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13923; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13924; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
13925; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
13926; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
13927; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13928; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
13929; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13930; GFX11-CU-NEXT:    s_endpgm
13931;
13932; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13933; GFX12-WGP:       ; %bb.0: ; %entry
13934; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
13935; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13936; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
13937; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
13938; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
13939; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
13940; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
13941; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13942; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
13943; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13944; GFX12-WGP-NEXT:    s_endpgm
13945;
13946; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
13947; GFX12-CU:       ; %bb.0: ; %entry
13948; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
13949; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
13950; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
13951; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
13952; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
13953; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
13954; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
13955; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
13956; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
13957; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
13958; GFX12-CU-NEXT:    s_endpgm
13959    ptr addrspace(1) %out, i32 %in, i32 %old) {
13960entry:
  ; %gep is element 4 of the i32 array at %out, i.e. 16 bytes past the base.
  ; cmpxchg operands are pointer, expected value (%old), replacement (%in);
  ; the result %val is never used.
13961  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
13962  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
13963  ret void
13964}
13965
; Test kernel: compare-and-swap on an addrspace(1) pointer at
; syncscope("wavefront-one-as") with success ordering acq_rel and failure
; ordering acquire.
;
; The kernel takes %out, %in and %old, addresses i32 element 4 of %out (the
; constant 16 that shows up in every expected sequence) and issues a volatile
; cmpxchg whose result is never read.
;
; Each expected sequence below contains exactly one wait instruction, placed
; after the scalar loads of the kernel arguments, and ends with the cmpswap
; instruction followed directly by s_endpgm. None of the sequences contains a
; cache invalidate or write-back instruction.
;
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
13966define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
13967; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
13968; GFX6:       ; %bb.0: ; %entry
13969; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
13970; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
13971; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
13972; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
13973; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13974; GFX6-NEXT:    s_mov_b32 s12, s5
13975; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
13976; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
13977; GFX6-NEXT:    s_mov_b32 s11, -1
13978; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
13979; GFX6-NEXT:    s_mov_b32 s5, s12
13980; GFX6-NEXT:    s_mov_b32 s6, s11
13981; GFX6-NEXT:    s_mov_b32 s7, s10
13982; GFX6-NEXT:    v_mov_b32_e32 v0, s9
13983; GFX6-NEXT:    v_mov_b32_e32 v2, s8
13984; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
13985; GFX6-NEXT:    v_mov_b32_e32 v1, v2
13986; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
13987; GFX6-NEXT:    s_endpgm
13988;
13989; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
13990; GFX7:       ; %bb.0: ; %entry
13991; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
13992; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
13993; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
13994; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
13995; GFX7-NEXT:    s_mov_b64 s[10:11], 16
13996; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
13997; GFX7-NEXT:    s_mov_b32 s4, s8
13998; GFX7-NEXT:    s_mov_b32 s5, s9
13999; GFX7-NEXT:    s_mov_b32 s9, s10
14000; GFX7-NEXT:    s_mov_b32 s8, s11
14001; GFX7-NEXT:    s_add_u32 s4, s4, s9
14002; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14003; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14004; GFX7-NEXT:    s_mov_b32 s5, s8
14005; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14006; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14007; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14008; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14009; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14010; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14011; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14012; GFX7-NEXT:    s_endpgm
14013;
14014; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14015; GFX10-WGP:       ; %bb.0: ; %entry
14016; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
14017; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14018; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
14019; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
14020; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14021; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
14022; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
14023; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14024; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
14025; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14026; GFX10-WGP-NEXT:    s_endpgm
14027;
14028; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14029; GFX10-CU:       ; %bb.0: ; %entry
14030; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
14031; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14032; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
14033; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
14034; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14035; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
14036; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
14037; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14038; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
14039; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14040; GFX10-CU-NEXT:    s_endpgm
14041;
14042; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14043; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14044; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
14045; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
14046; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
14047; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
14048; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14049; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
14050; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
14051; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
14052; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
14053; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
14054; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
14055; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
14056; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
14057; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
14058; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
14059; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14060; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
14061; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
14062; SKIP-CACHE-INV-NEXT:    s_endpgm
14063;
14064; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14065; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14066; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14067; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14068; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14069; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14070; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14071; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14072; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14073; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14074; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14075; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14076; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14077;
14078; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14079; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14080; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14081; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14082; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14083; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14084; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14085; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14086; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14087; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14088; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14089; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14090; GFX90A-TGSPLIT-NEXT:    s_endpgm
14091;
14092; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14093; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14094; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14095; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14096; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14097; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14098; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14099; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14100; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14101; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14102; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14103; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14104; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14105;
14106; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14107; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14108; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14109; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14110; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14111; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14112; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14113; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14114; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14115; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14116; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14117; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14118; GFX940-TGSPLIT-NEXT:    s_endpgm
14119;
14120; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14121; GFX11-WGP:       ; %bb.0: ; %entry
14122; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
14123; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14124; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14125; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14126; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14127; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
14128; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
14129; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14130; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
14131; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14132; GFX11-WGP-NEXT:    s_endpgm
14133;
14134; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14135; GFX11-CU:       ; %bb.0: ; %entry
14136; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
14137; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14138; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14139; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14140; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14141; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
14142; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
14143; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14144; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
14145; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14146; GFX11-CU-NEXT:    s_endpgm
14147;
14148; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14149; GFX12-WGP:       ; %bb.0: ; %entry
14150; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
14151; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14152; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14153; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14154; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14155; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
14156; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
14157; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14158; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
14159; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14160; GFX12-WGP-NEXT:    s_endpgm
14161;
14162; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
14163; GFX12-CU:       ; %bb.0: ; %entry
14164; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
14165; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14166; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14167; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14168; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14169; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
14170; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
14171; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14172; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
14173; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14174; GFX12-CU-NEXT:    s_endpgm
14175    ptr addrspace(1) %out, i32 %in, i32 %old) {
14176entry:
  ; Address i32 element 4 of %out, i.e. 16 bytes past the base pointer.
14177  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on a match; %val is never read.
14178  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
14179  ret void
14180}
14181
; Test kernel: compare-and-swap on an addrspace(1) pointer at
; syncscope("wavefront-one-as") with success ordering seq_cst and failure
; ordering acquire.
;
; The kernel takes %out, %in and %old, addresses i32 element 4 of %out (the
; constant 16 that shows up in every expected sequence) and issues a volatile
; cmpxchg whose result is never read.
;
; Each expected sequence below contains exactly one wait instruction, placed
; after the scalar loads of the kernel arguments, and ends with the cmpswap
; instruction followed directly by s_endpgm. None of the sequences contains a
; cache invalidate or write-back instruction.
;
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14182define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
14183; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14184; GFX6:       ; %bb.0: ; %entry
14185; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
14186; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
14187; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
14188; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
14189; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
14190; GFX6-NEXT:    s_mov_b32 s12, s5
14191; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
14192; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
14193; GFX6-NEXT:    s_mov_b32 s11, -1
14194; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
14195; GFX6-NEXT:    s_mov_b32 s5, s12
14196; GFX6-NEXT:    s_mov_b32 s6, s11
14197; GFX6-NEXT:    s_mov_b32 s7, s10
14198; GFX6-NEXT:    v_mov_b32_e32 v0, s9
14199; GFX6-NEXT:    v_mov_b32_e32 v2, s8
14200; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14201; GFX6-NEXT:    v_mov_b32_e32 v1, v2
14202; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
14203; GFX6-NEXT:    s_endpgm
14204;
14205; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14206; GFX7:       ; %bb.0: ; %entry
14207; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14208; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14209; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14210; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14211; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14212; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14213; GFX7-NEXT:    s_mov_b32 s4, s8
14214; GFX7-NEXT:    s_mov_b32 s5, s9
14215; GFX7-NEXT:    s_mov_b32 s9, s10
14216; GFX7-NEXT:    s_mov_b32 s8, s11
14217; GFX7-NEXT:    s_add_u32 s4, s4, s9
14218; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14219; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14220; GFX7-NEXT:    s_mov_b32 s5, s8
14221; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14222; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14223; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14224; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14225; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14226; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14227; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14228; GFX7-NEXT:    s_endpgm
14229;
14230; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14231; GFX10-WGP:       ; %bb.0: ; %entry
14232; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
14233; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14234; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
14235; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
14236; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14237; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
14238; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
14239; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14240; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
14241; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14242; GFX10-WGP-NEXT:    s_endpgm
14243;
14244; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14245; GFX10-CU:       ; %bb.0: ; %entry
14246; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
14247; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14248; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
14249; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
14250; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14251; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
14252; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
14253; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14254; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
14255; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14256; GFX10-CU-NEXT:    s_endpgm
14257;
14258; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14259; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14260; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
14261; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
14262; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
14263; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
14264; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14265; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
14266; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
14267; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
14268; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
14269; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
14270; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
14271; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
14272; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
14273; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
14274; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
14275; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14276; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
14277; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
14278; SKIP-CACHE-INV-NEXT:    s_endpgm
14279;
14280; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14281; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14282; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14283; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14284; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14285; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14286; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14287; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14288; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14289; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14290; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14291; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14292; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14293;
14294; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14295; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14296; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14297; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14298; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14299; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14300; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14301; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14302; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14303; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14304; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14305; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14306; GFX90A-TGSPLIT-NEXT:    s_endpgm
14307;
14308; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14309; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14310; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14311; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14312; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14313; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14314; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14315; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14316; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14317; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14318; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14319; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14320; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14321;
14322; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14323; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14324; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14325; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14326; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14327; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14328; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14329; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14330; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14331; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14332; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14333; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14334; GFX940-TGSPLIT-NEXT:    s_endpgm
14335;
14336; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14337; GFX11-WGP:       ; %bb.0: ; %entry
14338; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
14339; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14340; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14341; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14342; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14343; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
14344; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
14345; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14346; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
14347; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14348; GFX11-WGP-NEXT:    s_endpgm
14349;
14350; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14351; GFX11-CU:       ; %bb.0: ; %entry
14352; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
14353; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14354; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14355; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14356; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14357; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
14358; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
14359; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14360; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
14361; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14362; GFX11-CU-NEXT:    s_endpgm
14363;
14364; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14365; GFX12-WGP:       ; %bb.0: ; %entry
14366; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
14367; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14368; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14369; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14370; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14371; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
14372; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
14373; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14374; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
14375; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14376; GFX12-WGP-NEXT:    s_endpgm
14377;
14378; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
14379; GFX12-CU:       ; %bb.0: ; %entry
14380; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
14381; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14382; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14383; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14384; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14385; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
14386; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
14387; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14388; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
14389; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14390; GFX12-CU-NEXT:    s_endpgm
14391    ptr addrspace(1) %out, i32 %in, i32 %old) {
14392entry:
  ; Address i32 element 4 of %out, i.e. 16 bytes past the base pointer.
14393  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and store %in on a match; %val is never read.
14394  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
14395  ret void
14396}
14397
14398define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
14399; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14400; GFX6:       ; %bb.0: ; %entry
14401; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
14402; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
14403; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
14404; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
14405; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
14406; GFX6-NEXT:    s_mov_b32 s12, s5
14407; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
14408; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
14409; GFX6-NEXT:    s_mov_b32 s11, -1
14410; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
14411; GFX6-NEXT:    s_mov_b32 s5, s12
14412; GFX6-NEXT:    s_mov_b32 s6, s11
14413; GFX6-NEXT:    s_mov_b32 s7, s10
14414; GFX6-NEXT:    v_mov_b32_e32 v0, s9
14415; GFX6-NEXT:    v_mov_b32_e32 v2, s8
14416; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14417; GFX6-NEXT:    v_mov_b32_e32 v1, v2
14418; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
14419; GFX6-NEXT:    s_endpgm
14420;
14421; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14422; GFX7:       ; %bb.0: ; %entry
14423; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14424; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14425; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14426; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14427; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14428; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14429; GFX7-NEXT:    s_mov_b32 s4, s8
14430; GFX7-NEXT:    s_mov_b32 s5, s9
14431; GFX7-NEXT:    s_mov_b32 s9, s10
14432; GFX7-NEXT:    s_mov_b32 s8, s11
14433; GFX7-NEXT:    s_add_u32 s4, s4, s9
14434; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14435; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14436; GFX7-NEXT:    s_mov_b32 s5, s8
14437; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14438; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14439; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14440; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14441; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14442; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14443; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14444; GFX7-NEXT:    s_endpgm
14445;
14446; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14447; GFX10-WGP:       ; %bb.0: ; %entry
14448; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
14449; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14450; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
14451; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
14452; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14453; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
14454; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
14455; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14456; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
14457; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14458; GFX10-WGP-NEXT:    s_endpgm
14459;
14460; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14461; GFX10-CU:       ; %bb.0: ; %entry
14462; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
14463; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14464; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
14465; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
14466; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14467; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
14468; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
14469; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14470; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
14471; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14472; GFX10-CU-NEXT:    s_endpgm
14473;
14474; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14475; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14476; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
14477; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
14478; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
14479; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
14480; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14481; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
14482; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
14483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
14484; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
14485; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
14486; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
14487; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
14488; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
14489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
14490; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
14491; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14492; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
14493; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
14494; SKIP-CACHE-INV-NEXT:    s_endpgm
14495;
14496; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14497; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14498; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14499; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14500; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14501; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14502; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14503; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14504; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14505; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14506; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14507; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14508; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14509;
14510; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14511; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14512; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14513; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14514; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14515; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14516; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14517; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14518; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14519; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14520; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14521; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14522; GFX90A-TGSPLIT-NEXT:    s_endpgm
14523;
14524; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14525; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14526; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14527; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14528; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14529; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14530; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14531; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14532; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14533; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14534; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14535; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14536; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14537;
14538; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14539; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14540; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14541; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14542; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14543; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14544; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14545; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14546; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14547; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14548; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14549; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14550; GFX940-TGSPLIT-NEXT:    s_endpgm
14551;
14552; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14553; GFX11-WGP:       ; %bb.0: ; %entry
14554; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
14555; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14556; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14557; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14558; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14559; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
14560; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
14561; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14562; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
14563; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14564; GFX11-WGP-NEXT:    s_endpgm
14565;
14566; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14567; GFX11-CU:       ; %bb.0: ; %entry
14568; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
14569; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14570; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14571; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14572; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14573; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
14574; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
14575; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14576; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
14577; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14578; GFX11-CU-NEXT:    s_endpgm
14579;
14580; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14581; GFX12-WGP:       ; %bb.0: ; %entry
14582; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
14583; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14584; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14585; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14586; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14587; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
14588; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
14589; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14590; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
14591; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14592; GFX12-WGP-NEXT:    s_endpgm
14593;
14594; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
14595; GFX12-CU:       ; %bb.0: ; %entry
14596; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
14597; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14598; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14599; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14600; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14601; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
14602; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
14603; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14604; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
14605; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14606; GFX12-CU-NEXT:    s_endpgm
14607    ptr addrspace(1) %out, i32 %in, i32 %old) {
14608entry:
14609  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
14610  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
14611  ret void
14612}
14613
; Test: volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront-one-as"), success ordering acquire and failure ordering
; seq_cst. The result of the cmpxchg is unused and the kernel returns void.
;
; In every expected sequence below, the atomic compare-swap instruction is
; immediately followed by s_endpgm (all checks are -NEXT), and the constant 16
; appears either as "offset:16" or as a 64-bit add of 16 to the base (GFX7).
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14614define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
14615; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14616; GFX6:       ; %bb.0: ; %entry
14617; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
14618; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
14619; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
14620; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
14621; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
14622; GFX6-NEXT:    s_mov_b32 s12, s5
14623; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
14624; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
14625; GFX6-NEXT:    s_mov_b32 s11, -1
14626; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
14627; GFX6-NEXT:    s_mov_b32 s5, s12
14628; GFX6-NEXT:    s_mov_b32 s6, s11
14629; GFX6-NEXT:    s_mov_b32 s7, s10
14630; GFX6-NEXT:    v_mov_b32_e32 v0, s9
14631; GFX6-NEXT:    v_mov_b32_e32 v2, s8
14632; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14633; GFX6-NEXT:    v_mov_b32_e32 v1, v2
14634; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
14635; GFX6-NEXT:    s_endpgm
14636;
14637; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14638; GFX7:       ; %bb.0: ; %entry
14639; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14640; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14641; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14642; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14643; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14644; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14645; GFX7-NEXT:    s_mov_b32 s4, s8
14646; GFX7-NEXT:    s_mov_b32 s5, s9
14647; GFX7-NEXT:    s_mov_b32 s9, s10
14648; GFX7-NEXT:    s_mov_b32 s8, s11
14649; GFX7-NEXT:    s_add_u32 s4, s4, s9
14650; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14651; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14652; GFX7-NEXT:    s_mov_b32 s5, s8
14653; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14654; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14655; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14656; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14657; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14658; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14659; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14660; GFX7-NEXT:    s_endpgm
14661;
14662; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14663; GFX10-WGP:       ; %bb.0: ; %entry
14664; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
14665; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14666; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
14667; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
14668; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14669; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
14670; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
14671; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14672; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
14673; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14674; GFX10-WGP-NEXT:    s_endpgm
14675;
14676; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14677; GFX10-CU:       ; %bb.0: ; %entry
14678; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
14679; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14680; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
14681; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
14682; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14683; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
14684; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
14685; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14686; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
14687; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14688; GFX10-CU-NEXT:    s_endpgm
14689;
14690; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14691; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14692; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
14693; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
14694; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
14695; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
14696; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14697; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
14698; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
14699; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
14700; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
14701; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
14702; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
14703; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
14704; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
14705; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
14706; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
14707; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14708; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
14709; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
14710; SKIP-CACHE-INV-NEXT:    s_endpgm
14711;
14712; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14713; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14714; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14715; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14716; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14717; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14718; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14719; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14720; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14721; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14722; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14723; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14724; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14725;
14726; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14727; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14728; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14729; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14730; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14731; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14732; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14733; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14734; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14735; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14736; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14737; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14738; GFX90A-TGSPLIT-NEXT:    s_endpgm
14739;
14740; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14741; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14742; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14743; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14744; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14745; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14746; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14747; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14748; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14749; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14750; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14751; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14752; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14753;
14754; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14755; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14756; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14757; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14758; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14759; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14760; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14761; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14762; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14763; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14764; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14765; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14766; GFX940-TGSPLIT-NEXT:    s_endpgm
14767;
14768; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14769; GFX11-WGP:       ; %bb.0: ; %entry
14770; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
14771; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14772; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14773; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14774; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14775; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
14776; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
14777; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14778; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
14779; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14780; GFX11-WGP-NEXT:    s_endpgm
14781;
14782; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14783; GFX11-CU:       ; %bb.0: ; %entry
14784; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
14785; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14786; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14787; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14788; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
14789; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
14790; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
14791; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14792; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
14793; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14794; GFX11-CU-NEXT:    s_endpgm
14795;
14796; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14797; GFX12-WGP:       ; %bb.0: ; %entry
14798; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
14799; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14800; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14801; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14802; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
14803; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
14804; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
14805; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14806; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
14807; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14808; GFX12-WGP-NEXT:    s_endpgm
14809;
14810; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
14811; GFX12-CU:       ; %bb.0: ; %entry
14812; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
14813; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14814; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
14815; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
14816; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
14817; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
14818; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
14819; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14820; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
14821; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14822; GFX12-CU-NEXT:    s_endpgm
14823    ptr addrspace(1) %out, i32 %in, i32 %old) {
14824entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
  ; This is the 16 that shows up in the expected instruction sequences above.
14825  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the value at %gep against %old and, on a match, store %in.
  ; Orderings: acquire on success, seq_cst on failure. %val is never read.
14826  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
14827  ret void
14828}
14829
; Test: volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront-one-as"), success ordering release and failure ordering
; seq_cst. The result of the cmpxchg is unused and the kernel returns void.
;
; In every expected sequence below, the atomic compare-swap instruction is
; immediately followed by s_endpgm (all checks are -NEXT), and the constant 16
; appears either as "offset:16" or as a 64-bit add of 16 to the base (GFX7).
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14830define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
14831; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14832; GFX6:       ; %bb.0: ; %entry
14833; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
14834; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
14835; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
14836; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
14837; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
14838; GFX6-NEXT:    s_mov_b32 s12, s5
14839; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
14840; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
14841; GFX6-NEXT:    s_mov_b32 s11, -1
14842; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
14843; GFX6-NEXT:    s_mov_b32 s5, s12
14844; GFX6-NEXT:    s_mov_b32 s6, s11
14845; GFX6-NEXT:    s_mov_b32 s7, s10
14846; GFX6-NEXT:    v_mov_b32_e32 v0, s9
14847; GFX6-NEXT:    v_mov_b32_e32 v2, s8
14848; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14849; GFX6-NEXT:    v_mov_b32_e32 v1, v2
14850; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
14851; GFX6-NEXT:    s_endpgm
14852;
14853; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14854; GFX7:       ; %bb.0: ; %entry
14855; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
14856; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
14857; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
14858; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
14859; GFX7-NEXT:    s_mov_b64 s[10:11], 16
14860; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
14861; GFX7-NEXT:    s_mov_b32 s4, s8
14862; GFX7-NEXT:    s_mov_b32 s5, s9
14863; GFX7-NEXT:    s_mov_b32 s9, s10
14864; GFX7-NEXT:    s_mov_b32 s8, s11
14865; GFX7-NEXT:    s_add_u32 s4, s4, s9
14866; GFX7-NEXT:    s_addc_u32 s8, s5, s8
14867; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14868; GFX7-NEXT:    s_mov_b32 s5, s8
14869; GFX7-NEXT:    v_mov_b32_e32 v2, s7
14870; GFX7-NEXT:    v_mov_b32_e32 v0, s6
14871; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14872; GFX7-NEXT:    v_mov_b32_e32 v3, v0
14873; GFX7-NEXT:    v_mov_b32_e32 v0, s4
14874; GFX7-NEXT:    v_mov_b32_e32 v1, s5
14875; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
14876; GFX7-NEXT:    s_endpgm
14877;
14878; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14879; GFX10-WGP:       ; %bb.0: ; %entry
14880; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
14881; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14882; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
14883; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
14884; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14885; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
14886; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
14887; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14888; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
14889; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14890; GFX10-WGP-NEXT:    s_endpgm
14891;
14892; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14893; GFX10-CU:       ; %bb.0: ; %entry
14894; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
14895; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14896; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
14897; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
14898; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
14899; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
14900; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
14901; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14902; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
14903; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
14904; GFX10-CU-NEXT:    s_endpgm
14905;
14906; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14907; SKIP-CACHE-INV:       ; %bb.0: ; %entry
14908; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
14909; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
14910; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
14911; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
14912; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
14913; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
14914; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
14915; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
14916; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
14917; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
14918; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
14919; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
14920; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
14921; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
14922; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
14923; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
14924; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
14925; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
14926; SKIP-CACHE-INV-NEXT:    s_endpgm
14927;
14928; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14929; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
14930; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14931; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14932; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14933; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14934; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14935; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14936; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14937; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14938; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14939; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14940; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
14941;
14942; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14943; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
14944; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14945; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
14946; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
14947; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
14948; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14949; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
14950; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
14951; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14952; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14953; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
14954; GFX90A-TGSPLIT-NEXT:    s_endpgm
14955;
14956; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14957; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
14958; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14959; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14960; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14961; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14962; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14963; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14964; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14965; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14966; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14967; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14968; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
14969;
14970; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14971; GFX940-TGSPLIT:       ; %bb.0: ; %entry
14972; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
14973; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
14974; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
14975; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
14976; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
14977; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
14978; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
14979; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14980; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
14981; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
14982; GFX940-TGSPLIT-NEXT:    s_endpgm
14983;
14984; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14985; GFX11-WGP:       ; %bb.0: ; %entry
14986; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
14987; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
14988; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
14989; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
14990; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
14991; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
14992; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
14993; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
14994; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
14995; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
14996; GFX11-WGP-NEXT:    s_endpgm
14997;
14998; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
14999; GFX11-CU:       ; %bb.0: ; %entry
15000; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
15001; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15002; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15003; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15004; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15005; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
15006; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
15007; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15008; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
15009; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15010; GFX11-CU-NEXT:    s_endpgm
15011;
15012; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
15013; GFX12-WGP:       ; %bb.0: ; %entry
15014; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
15015; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15016; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15017; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15018; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15019; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
15020; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
15021; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15022; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
15023; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15024; GFX12-WGP-NEXT:    s_endpgm
15025;
15026; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
15027; GFX12-CU:       ; %bb.0: ; %entry
15028; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
15029; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15030; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15031; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15032; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15033; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
15034; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
15035; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15036; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
15037; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15038; GFX12-CU-NEXT:    s_endpgm
15039    ptr addrspace(1) %out, i32 %in, i32 %old) {
15040entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
  ; This is the 16 that shows up in the expected instruction sequences above.
15041  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare the value at %gep against %old and, on a match, store %in.
  ; Orderings: release on success, seq_cst on failure. %val is never read.
15042  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
15043  ret void
15044}
15045
15046define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
15047; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15048; GFX6:       ; %bb.0: ; %entry
15049; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
15050; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15051; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
15052; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
15053; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15054; GFX6-NEXT:    s_mov_b32 s12, s5
15055; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
15056; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
15057; GFX6-NEXT:    s_mov_b32 s11, -1
15058; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
15059; GFX6-NEXT:    s_mov_b32 s5, s12
15060; GFX6-NEXT:    s_mov_b32 s6, s11
15061; GFX6-NEXT:    s_mov_b32 s7, s10
15062; GFX6-NEXT:    v_mov_b32_e32 v0, s9
15063; GFX6-NEXT:    v_mov_b32_e32 v2, s8
15064; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15065; GFX6-NEXT:    v_mov_b32_e32 v1, v2
15066; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
15067; GFX6-NEXT:    s_endpgm
15068;
15069; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15070; GFX7:       ; %bb.0: ; %entry
15071; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15072; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15073; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15074; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15075; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15076; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15077; GFX7-NEXT:    s_mov_b32 s4, s8
15078; GFX7-NEXT:    s_mov_b32 s5, s9
15079; GFX7-NEXT:    s_mov_b32 s9, s10
15080; GFX7-NEXT:    s_mov_b32 s8, s11
15081; GFX7-NEXT:    s_add_u32 s4, s4, s9
15082; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15083; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15084; GFX7-NEXT:    s_mov_b32 s5, s8
15085; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15086; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15087; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15088; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15089; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15090; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15091; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15092; GFX7-NEXT:    s_endpgm
15093;
15094; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15095; GFX10-WGP:       ; %bb.0: ; %entry
15096; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
15097; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15098; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
15099; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
15100; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15101; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
15102; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
15103; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15104; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
15105; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
15106; GFX10-WGP-NEXT:    s_endpgm
15107;
15108; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15109; GFX10-CU:       ; %bb.0: ; %entry
15110; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
15111; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15112; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
15113; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
15114; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15115; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
15116; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
15117; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15118; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
15119; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
15120; GFX10-CU-NEXT:    s_endpgm
15121;
15122; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15123; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15124; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
15125; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
15126; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
15127; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
15128; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15129; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
15130; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
15131; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
15132; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
15133; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
15134; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
15135; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
15136; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
15137; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
15138; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
15139; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15140; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
15141; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
15142; SKIP-CACHE-INV-NEXT:    s_endpgm
15143;
15144; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15145; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15146; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15147; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15148; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15149; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15150; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15151; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15152; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15153; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15154; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15155; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
15156; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15157;
15158; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15159; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15160; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15161; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15162; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15163; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15164; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15165; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15166; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15167; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15168; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15169; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
15170; GFX90A-TGSPLIT-NEXT:    s_endpgm
15171;
15172; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15173; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15174; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15175; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15176; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15177; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15178; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15179; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15180; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15181; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15182; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15183; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
15184; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15185;
15186; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15187; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15188; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15189; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15190; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15191; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15192; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15193; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15194; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15195; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15196; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15197; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
15198; GFX940-TGSPLIT-NEXT:    s_endpgm
15199;
15200; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15201; GFX11-WGP:       ; %bb.0: ; %entry
15202; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
15203; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15204; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15205; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15206; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15207; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
15208; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
15209; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15210; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
15211; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15212; GFX11-WGP-NEXT:    s_endpgm
15213;
15214; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15215; GFX11-CU:       ; %bb.0: ; %entry
15216; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
15217; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15218; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15219; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15220; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15221; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
15222; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
15223; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15224; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
15225; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15226; GFX11-CU-NEXT:    s_endpgm
15227;
15228; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15229; GFX12-WGP:       ; %bb.0: ; %entry
15230; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
15231; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15232; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15233; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15234; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15235; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
15236; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
15237; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15238; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
15239; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15240; GFX12-WGP-NEXT:    s_endpgm
15241;
15242; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
15243; GFX12-CU:       ; %bb.0: ; %entry
15244; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
15245; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15246; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15247; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15248; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15249; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
15250; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
15251; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15252; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
15253; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15254; GFX12-CU-NEXT:    s_endpgm
15255    ptr addrspace(1) %out, i32 %in, i32 %old) {
15256entry:
15257  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
15258  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
15259  ret void
15260}
15261
; Non-returning cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront-one-as") and seq_cst success / seq_cst failure ordering.
; In every checked sequence below the cmpswap carries no glc, sc0 or th
; modifier, and no wait or cache-invalidate instruction appears after the
; wait that follows the kernel-argument loads.
15262define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
15263; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15264; GFX6:       ; %bb.0: ; %entry
15265; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
15266; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15267; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
15268; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
15269; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15270; GFX6-NEXT:    s_mov_b32 s12, s5
15271; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
15272; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
15273; GFX6-NEXT:    s_mov_b32 s11, -1
15274; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
15275; GFX6-NEXT:    s_mov_b32 s5, s12
15276; GFX6-NEXT:    s_mov_b32 s6, s11
15277; GFX6-NEXT:    s_mov_b32 s7, s10
15278; GFX6-NEXT:    v_mov_b32_e32 v0, s9
15279; GFX6-NEXT:    v_mov_b32_e32 v2, s8
15280; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15281; GFX6-NEXT:    v_mov_b32_e32 v1, v2
15282; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
15283; GFX6-NEXT:    s_endpgm
15284;
15285; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15286; GFX7:       ; %bb.0: ; %entry
15287; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
15288; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
15289; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
15290; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
15291; GFX7-NEXT:    s_mov_b64 s[10:11], 16
15292; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15293; GFX7-NEXT:    s_mov_b32 s4, s8
15294; GFX7-NEXT:    s_mov_b32 s5, s9
15295; GFX7-NEXT:    s_mov_b32 s9, s10
15296; GFX7-NEXT:    s_mov_b32 s8, s11
15297; GFX7-NEXT:    s_add_u32 s4, s4, s9
15298; GFX7-NEXT:    s_addc_u32 s8, s5, s8
15299; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15300; GFX7-NEXT:    s_mov_b32 s5, s8
15301; GFX7-NEXT:    v_mov_b32_e32 v2, s7
15302; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15303; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15304; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15305; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15306; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15307; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
15308; GFX7-NEXT:    s_endpgm
15309;
15310; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15311; GFX10-WGP:       ; %bb.0: ; %entry
15312; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
15313; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15314; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
15315; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
15316; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15317; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
15318; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
15319; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15320; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
15321; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
15322; GFX10-WGP-NEXT:    s_endpgm
15323;
15324; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15325; GFX10-CU:       ; %bb.0: ; %entry
15326; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
15327; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15328; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
15329; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
15330; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15331; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
15332; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
15333; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15334; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
15335; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
15336; GFX10-CU-NEXT:    s_endpgm
15337;
15338; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15339; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15340; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
15341; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
15342; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
15343; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
15344; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15345; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
15346; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
15347; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
15348; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
15349; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
15350; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
15351; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
15352; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
15353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
15354; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
15355; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
15357; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
15358; SKIP-CACHE-INV-NEXT:    s_endpgm
15359;
15360; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15361; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15362; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15363; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15364; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15365; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15366; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15367; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15368; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15369; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15370; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15371; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
15372; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15373;
15374; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15375; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15376; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15377; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15378; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15379; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15380; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15381; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15382; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15383; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15384; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15385; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
15386; GFX90A-TGSPLIT-NEXT:    s_endpgm
15387;
15388; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15389; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15390; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15391; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15392; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15393; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15394; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15395; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15396; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15397; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15398; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15399; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
15400; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15401;
15402; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15403; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15404; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15405; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15406; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15407; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15408; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15409; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15410; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15411; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15412; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15413; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
15414; GFX940-TGSPLIT-NEXT:    s_endpgm
15415;
15416; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15417; GFX11-WGP:       ; %bb.0: ; %entry
15418; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
15419; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15420; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15421; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15422; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15423; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
15424; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
15425; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15426; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
15427; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15428; GFX11-WGP-NEXT:    s_endpgm
15429;
15430; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15431; GFX11-CU:       ; %bb.0: ; %entry
15432; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
15433; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15434; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15435; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15436; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15437; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
15438; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
15439; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15440; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
15441; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15442; GFX11-CU-NEXT:    s_endpgm
15443;
15444; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15445; GFX12-WGP:       ; %bb.0: ; %entry
15446; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
15447; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15448; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15449; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15450; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15451; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
15452; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
15453; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15454; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
15455; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15456; GFX12-WGP-NEXT:    s_endpgm
15457;
15458; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
15459; GFX12-CU:       ; %bb.0: ; %entry
15460; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
15461; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15462; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15463; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15464; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15465; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
15466; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
15467; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15468; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
15469; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
15470; GFX12-CU-NEXT:    s_endpgm
15471    ptr addrspace(1) %out, i32 %in, i32 %old) {
15472entry:
15473  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 ; i32 index 4, i.e. the 16-byte offset seen in the checks
15474  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst ; %val has no uses in this function
15475  ret void
15476}
15477
; Returning cmpxchg on a global (addrspace(1)) pointer with
; syncscope("wavefront-one-as") and monotonic / monotonic ordering. Field 0 of
; the cmpxchg result is stored to %out, and every checked sequence below uses
; a cmpswap with a destination register and a glc, sc0 (gfx940) or
; th:TH_ATOMIC_RETURN (gfx12) modifier, followed by a vmcnt / loadcnt wait
; before the store.
15478define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
15479; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15480; GFX6:       ; %bb.0: ; %entry
15481; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
15482; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15483; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
15484; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
15485; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15486; GFX6-NEXT:    s_mov_b32 s12, s5
15487; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
15488; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
15489; GFX6-NEXT:    s_mov_b32 s11, -1
15490; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
15491; GFX6-NEXT:    s_mov_b32 s5, s12
15492; GFX6-NEXT:    s_mov_b32 s6, s11
15493; GFX6-NEXT:    s_mov_b32 s7, s10
15494; GFX6-NEXT:    v_mov_b32_e32 v0, s9
15495; GFX6-NEXT:    v_mov_b32_e32 v2, s8
15496; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15497; GFX6-NEXT:    v_mov_b32_e32 v1, v2
15498; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
15499; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
15500; GFX6-NEXT:    s_waitcnt vmcnt(0)
15501; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
15502; GFX6-NEXT:    s_endpgm
15503;
15504; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15505; GFX7:       ; %bb.0: ; %entry
15506; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
15507; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15508; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
15509; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
15510; GFX7-NEXT:    s_mov_b64 s[12:13], 16
15511; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15512; GFX7-NEXT:    s_mov_b32 s6, s4
15513; GFX7-NEXT:    s_mov_b32 s7, s5
15514; GFX7-NEXT:    s_mov_b32 s11, s12
15515; GFX7-NEXT:    s_mov_b32 s10, s13
15516; GFX7-NEXT:    s_add_u32 s6, s6, s11
15517; GFX7-NEXT:    s_addc_u32 s10, s7, s10
15518; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15519; GFX7-NEXT:    s_mov_b32 s7, s10
15520; GFX7-NEXT:    v_mov_b32_e32 v2, s9
15521; GFX7-NEXT:    v_mov_b32_e32 v0, s8
15522; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15523; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15524; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15525; GFX7-NEXT:    v_mov_b32_e32 v1, s7
15526; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15527; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15528; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15529; GFX7-NEXT:    s_waitcnt vmcnt(0)
15530; GFX7-NEXT:    flat_store_dword v[0:1], v2
15531; GFX7-NEXT:    s_endpgm
15532;
15533; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15534; GFX10-WGP:       ; %bb.0: ; %entry
15535; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
15536; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15537; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
15538; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
15539; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15540; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
15541; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
15542; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15543; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
15544; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
15545; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15546; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
15547; GFX10-WGP-NEXT:    s_endpgm
15548;
15549; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15550; GFX10-CU:       ; %bb.0: ; %entry
15551; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
15552; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15553; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
15554; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
15555; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15556; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
15557; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
15558; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15559; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
15560; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
15561; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
15562; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
15563; GFX10-CU-NEXT:    s_endpgm
15564;
15565; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15566; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15567; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
15568; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
15569; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
15570; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
15571; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15572; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
15573; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
15574; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
15575; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
15576; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
15577; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
15578; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
15579; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
15580; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
15581; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
15582; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15583; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
15584; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
15585; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
15586; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15587; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
15588; SKIP-CACHE-INV-NEXT:    s_endpgm
15589;
15590; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15591; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15592; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15593; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15594; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15595; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15596; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15597; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15598; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15599; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15600; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15601; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
15602; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15603; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
15604; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15605;
15606; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15607; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15608; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15609; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15610; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15611; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15612; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15613; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15614; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15615; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15616; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15617; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
15618; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15619; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
15620; GFX90A-TGSPLIT-NEXT:    s_endpgm
15621;
15622; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15623; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15624; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15625; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15626; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15627; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15628; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15629; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15630; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15631; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15632; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15633; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
15634; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15635; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
15636; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15637;
15638; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15639; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15640; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15641; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15642; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15643; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15644; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15645; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15646; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15647; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15648; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15649; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
15650; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15651; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
15652; GFX940-TGSPLIT-NEXT:    s_endpgm
15653;
15654; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15655; GFX11-WGP:       ; %bb.0: ; %entry
15656; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
15657; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15658; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15659; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15660; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15661; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
15662; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
15663; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15664; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
15665; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
15666; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15667; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
15668; GFX11-WGP-NEXT:    s_endpgm
15669;
15670; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15671; GFX11-CU:       ; %bb.0: ; %entry
15672; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
15673; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15674; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15675; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15676; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15677; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
15678; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
15679; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15680; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
15681; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
15682; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
15683; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
15684; GFX11-CU-NEXT:    s_endpgm
15685;
15686; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15687; GFX12-WGP:       ; %bb.0: ; %entry
15688; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
15689; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15690; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15691; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15692; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15693; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
15694; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
15695; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15696; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
15697; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
15698; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15699; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
15700; GFX12-WGP-NEXT:    s_endpgm
15701;
15702; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
15703; GFX12-CU:       ; %bb.0: ; %entry
15704; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
15705; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15706; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15707; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15708; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15709; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
15710; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
15711; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15712; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
15713; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
15714; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
15715; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
15716; GFX12-CU-NEXT:    s_endpgm
15717    ptr addrspace(1) %out, i32 %in, i32 %old) {
15718entry:
15719  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 ; i32 index 4, i.e. the 16-byte offset seen in the checks
15720  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
15721  %val0 = extractvalue { i32, i1 } %val, 0 ; take the i32 member (field 0) of the { i32, i1 } result
15722  store i32 %val0, ptr addrspace(1) %out, align 4 ; written to the base of %out, not to %gep
15723  ret void
15724}
15725
; Test: returning cmpxchg on global memory (addrspace(1)) with
; syncscope("wavefront-one-as"), success ordering acquire and failure
; ordering monotonic.  The kernel compare-and-swaps element 4 of %out
; (%old is the expected value, %in the replacement) and stores the value
; that was read back to %out[0].
; In every checked configuration below the expected code is a returning
; compare-swap (glc, sc0 or th:TH_ATOMIC_RETURN), a wait on the vector
; memory / load counter, and then the store; no cache-invalidate
; instruction is listed in any of the expected sequences.
15726define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
15727; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15728; GFX6:       ; %bb.0: ; %entry
15729; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
15730; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15731; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
15732; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
15733; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15734; GFX6-NEXT:    s_mov_b32 s12, s5
15735; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
15736; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
15737; GFX6-NEXT:    s_mov_b32 s11, -1
15738; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
15739; GFX6-NEXT:    s_mov_b32 s5, s12
15740; GFX6-NEXT:    s_mov_b32 s6, s11
15741; GFX6-NEXT:    s_mov_b32 s7, s10
15742; GFX6-NEXT:    v_mov_b32_e32 v0, s9
15743; GFX6-NEXT:    v_mov_b32_e32 v2, s8
15744; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15745; GFX6-NEXT:    v_mov_b32_e32 v1, v2
15746; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
15747; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
15748; GFX6-NEXT:    s_waitcnt vmcnt(0)
15749; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
15750; GFX6-NEXT:    s_endpgm
15751;
15752; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15753; GFX7:       ; %bb.0: ; %entry
15754; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
15755; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15756; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
15757; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
15758; GFX7-NEXT:    s_mov_b64 s[12:13], 16
15759; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
15760; GFX7-NEXT:    s_mov_b32 s6, s4
15761; GFX7-NEXT:    s_mov_b32 s7, s5
15762; GFX7-NEXT:    s_mov_b32 s11, s12
15763; GFX7-NEXT:    s_mov_b32 s10, s13
15764; GFX7-NEXT:    s_add_u32 s6, s6, s11
15765; GFX7-NEXT:    s_addc_u32 s10, s7, s10
15766; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15767; GFX7-NEXT:    s_mov_b32 s7, s10
15768; GFX7-NEXT:    v_mov_b32_e32 v2, s9
15769; GFX7-NEXT:    v_mov_b32_e32 v0, s8
15770; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15771; GFX7-NEXT:    v_mov_b32_e32 v3, v0
15772; GFX7-NEXT:    v_mov_b32_e32 v0, s6
15773; GFX7-NEXT:    v_mov_b32_e32 v1, s7
15774; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15775; GFX7-NEXT:    v_mov_b32_e32 v0, s4
15776; GFX7-NEXT:    v_mov_b32_e32 v1, s5
15777; GFX7-NEXT:    s_waitcnt vmcnt(0)
15778; GFX7-NEXT:    flat_store_dword v[0:1], v2
15779; GFX7-NEXT:    s_endpgm
15780;
15781; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15782; GFX10-WGP:       ; %bb.0: ; %entry
15783; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
15784; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15785; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
15786; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
15787; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15788; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
15789; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
15790; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15791; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
15792; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
15793; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
15794; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
15795; GFX10-WGP-NEXT:    s_endpgm
15796;
15797; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15798; GFX10-CU:       ; %bb.0: ; %entry
15799; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
15800; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15801; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
15802; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
15803; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
15804; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
15805; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
15806; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15807; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
15808; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
15809; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
15810; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
15811; GFX10-CU-NEXT:    s_endpgm
15812;
15813; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15814; SKIP-CACHE-INV:       ; %bb.0: ; %entry
15815; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
15816; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
15817; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
15818; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
15819; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
15820; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
15821; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
15822; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
15823; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
15824; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
15825; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
15826; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
15827; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
15828; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
15829; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
15830; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15831; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
15832; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
15833; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
15834; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
15835; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
15836; SKIP-CACHE-INV-NEXT:    s_endpgm
15837;
15838; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15839; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
15840; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15841; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15842; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15843; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15844; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15845; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15846; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15847; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15848; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15849; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
15850; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15851; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
15852; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
15853;
15854; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15855; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
15856; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15857; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
15858; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
15859; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
15860; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15861; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
15862; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
15863; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15864; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15865; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
15866; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15867; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
15868; GFX90A-TGSPLIT-NEXT:    s_endpgm
15869;
15870; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15871; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
15872; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15873; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15874; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15875; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15876; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15877; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15878; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15879; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15880; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15881; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
15882; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15883; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
15884; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
15885;
15886; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15887; GFX940-TGSPLIT:       ; %bb.0: ; %entry
15888; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
15889; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
15890; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
15891; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
15892; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
15893; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
15894; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
15895; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15896; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
15897; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
15898; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
15899; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
15900; GFX940-TGSPLIT-NEXT:    s_endpgm
15901;
15902; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15903; GFX11-WGP:       ; %bb.0: ; %entry
15904; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
15905; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15906; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15907; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15908; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
15909; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
15910; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
15911; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15912; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
15913; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
15914; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
15915; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
15916; GFX11-WGP-NEXT:    s_endpgm
15917;
15918; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15919; GFX11-CU:       ; %bb.0: ; %entry
15920; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
15921; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15922; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15923; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15924; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
15925; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
15926; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
15927; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15928; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
15929; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
15930; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
15931; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
15932; GFX11-CU-NEXT:    s_endpgm
15933;
15934; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15935; GFX12-WGP:       ; %bb.0: ; %entry
15936; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
15937; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15938; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
15939; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
15940; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
15941; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
15942; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
15943; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15944; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
15945; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
15946; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
15947; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
15948; GFX12-WGP-NEXT:    s_endpgm
15949;
15950; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
15951; GFX12-CU:       ; %bb.0: ; %entry
15952; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
15953; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
15954; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
15955; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
15956; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
15957; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
15958; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
15959; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
15960; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
15961; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
15962; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
15963; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
15964; GFX12-CU-NEXT:    s_endpgm
15965    ptr addrspace(1) %out, i32 %in, i32 %old) {
15966entry:
  ; Element 4 of an i32 array is byte offset 16 from %out; this is the
  ; 16-byte displacement that appears in the expected instructions above.
15967  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Operation under test: compare against %old, write %in on a match.
15968  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag (field 1) is not used by this test.
15969  %val0 = extractvalue { i32, i1 } %val, 0
15970  store i32 %val0, ptr addrspace(1) %out, align 4
15971  ret void
15972}
15973
; Test: returning cmpxchg on global memory (addrspace(1)) with
; syncscope("wavefront-one-as"), success ordering release and failure
; ordering monotonic.  The kernel compare-and-swaps element 4 of %out
; (%old is the expected value, %in the replacement) and stores the value
; that was read back to %out[0].
; In every checked configuration below, only register moves sit between the
; wait that follows the scalar loads and the returning compare-swap (glc,
; sc0 or th:TH_ATOMIC_RETURN); the compare-swap is followed by a wait on the
; vector memory / load counter and then the store.  No cache write-back or
; invalidate instruction is listed in any of the expected sequences.
15974define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg(
15975; GFX6-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
15976; GFX6:       ; %bb.0: ; %entry
15977; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
15978; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
15979; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
15980; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
15981; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15982; GFX6-NEXT:    s_mov_b32 s12, s5
15983; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
15984; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
15985; GFX6-NEXT:    s_mov_b32 s11, -1
15986; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
15987; GFX6-NEXT:    s_mov_b32 s5, s12
15988; GFX6-NEXT:    s_mov_b32 s6, s11
15989; GFX6-NEXT:    s_mov_b32 s7, s10
15990; GFX6-NEXT:    v_mov_b32_e32 v0, s9
15991; GFX6-NEXT:    v_mov_b32_e32 v2, s8
15992; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
15993; GFX6-NEXT:    v_mov_b32_e32 v1, v2
15994; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
15995; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
15996; GFX6-NEXT:    s_waitcnt vmcnt(0)
15997; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
15998; GFX6-NEXT:    s_endpgm
15999;
16000; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16001; GFX7:       ; %bb.0: ; %entry
16002; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16003; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16004; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16005; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16006; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16007; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16008; GFX7-NEXT:    s_mov_b32 s6, s4
16009; GFX7-NEXT:    s_mov_b32 s7, s5
16010; GFX7-NEXT:    s_mov_b32 s11, s12
16011; GFX7-NEXT:    s_mov_b32 s10, s13
16012; GFX7-NEXT:    s_add_u32 s6, s6, s11
16013; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16014; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16015; GFX7-NEXT:    s_mov_b32 s7, s10
16016; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16017; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16018; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16019; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16020; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16021; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16022; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16023; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16024; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16025; GFX7-NEXT:    s_waitcnt vmcnt(0)
16026; GFX7-NEXT:    flat_store_dword v[0:1], v2
16027; GFX7-NEXT:    s_endpgm
16028;
16029; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16030; GFX10-WGP:       ; %bb.0: ; %entry
16031; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
16032; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16033; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
16034; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
16035; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16036; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16037; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
16038; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16039; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
16040; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16041; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16042; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
16043; GFX10-WGP-NEXT:    s_endpgm
16044;
16045; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16046; GFX10-CU:       ; %bb.0: ; %entry
16047; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
16048; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16049; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
16050; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
16051; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16052; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16053; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
16054; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16055; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
16056; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16057; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16058; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
16059; GFX10-CU-NEXT:    s_endpgm
16060;
16061; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16062; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16063; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16064; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16065; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16066; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16067; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16068; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
16069; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
16070; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
16071; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
16072; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
16073; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
16074; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
16075; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16076; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
16077; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
16078; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16079; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
16080; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
16081; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16082; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16083; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16084; SKIP-CACHE-INV-NEXT:    s_endpgm
16085;
16086; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16087; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16088; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16089; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16090; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16091; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16092; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16093; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16094; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16095; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16096; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16097; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16098; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16099; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16100; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16101;
16102; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16103; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16104; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16105; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16106; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16107; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16108; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16109; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16110; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16111; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16112; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16113; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16114; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16115; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16116; GFX90A-TGSPLIT-NEXT:    s_endpgm
16117;
16118; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16119; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16120; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16121; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16122; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16123; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16124; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16125; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16126; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16127; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16128; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16129; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16130; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16131; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16132; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16133;
16134; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16135; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16136; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16137; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16138; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16139; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16140; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16141; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16142; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16143; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16144; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16145; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16146; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16147; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16148; GFX940-TGSPLIT-NEXT:    s_endpgm
16149;
16150; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16151; GFX11-WGP:       ; %bb.0: ; %entry
16152; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
16153; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16154; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16155; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16156; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16157; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
16158; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
16159; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16160; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
16161; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16162; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16163; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16164; GFX11-WGP-NEXT:    s_endpgm
16165;
16166; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16167; GFX11-CU:       ; %bb.0: ; %entry
16168; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
16169; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16170; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16171; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16172; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16173; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
16174; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
16175; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16176; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
16177; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16178; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16179; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16180; GFX11-CU-NEXT:    s_endpgm
16181;
16182; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16183; GFX12-WGP:       ; %bb.0: ; %entry
16184; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
16185; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16186; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16187; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16188; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16189; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
16190; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
16191; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16192; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
16193; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16194; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16195; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16196; GFX12-WGP-NEXT:    s_endpgm
16197;
16198; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
16199; GFX12-CU:       ; %bb.0: ; %entry
16200; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
16201; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16202; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16203; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16204; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16205; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
16206; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
16207; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16208; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
16209; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16210; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
16211; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16212; GFX12-CU-NEXT:    s_endpgm
16213    ptr addrspace(1) %out, i32 %in, i32 %old) {
16214entry:
  ; Element 4 of an i32 array is byte offset 16 from %out; this is the
  ; 16-byte displacement that appears in the expected instructions above.
16215  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Operation under test: compare against %old, write %in on a match.
16216  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
  ; Field 0 of the result pair is the value read from memory; the i1 success
  ; flag (field 1) is not used by this test.
16217  %val0 = extractvalue { i32, i1 } %val, 0
16218  store i32 %val0, ptr addrspace(1) %out, align 4
16219  ret void
16220}
16221
16222define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
16223; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16224; GFX6:       ; %bb.0: ; %entry
16225; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
16226; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16227; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
16228; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
16229; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
16230; GFX6-NEXT:    s_mov_b32 s12, s5
16231; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
16232; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
16233; GFX6-NEXT:    s_mov_b32 s11, -1
16234; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
16235; GFX6-NEXT:    s_mov_b32 s5, s12
16236; GFX6-NEXT:    s_mov_b32 s6, s11
16237; GFX6-NEXT:    s_mov_b32 s7, s10
16238; GFX6-NEXT:    v_mov_b32_e32 v0, s9
16239; GFX6-NEXT:    v_mov_b32_e32 v2, s8
16240; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16241; GFX6-NEXT:    v_mov_b32_e32 v1, v2
16242; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
16243; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16244; GFX6-NEXT:    s_waitcnt vmcnt(0)
16245; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
16246; GFX6-NEXT:    s_endpgm
16247;
16248; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16249; GFX7:       ; %bb.0: ; %entry
16250; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16251; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16252; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16253; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16254; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16255; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16256; GFX7-NEXT:    s_mov_b32 s6, s4
16257; GFX7-NEXT:    s_mov_b32 s7, s5
16258; GFX7-NEXT:    s_mov_b32 s11, s12
16259; GFX7-NEXT:    s_mov_b32 s10, s13
16260; GFX7-NEXT:    s_add_u32 s6, s6, s11
16261; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16262; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16263; GFX7-NEXT:    s_mov_b32 s7, s10
16264; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16265; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16266; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16267; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16268; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16269; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16270; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16271; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16272; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16273; GFX7-NEXT:    s_waitcnt vmcnt(0)
16274; GFX7-NEXT:    flat_store_dword v[0:1], v2
16275; GFX7-NEXT:    s_endpgm
16276;
16277; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16278; GFX10-WGP:       ; %bb.0: ; %entry
16279; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
16280; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16281; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
16282; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
16283; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16284; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16285; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
16286; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16287; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
16288; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16289; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16290; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
16291; GFX10-WGP-NEXT:    s_endpgm
16292;
16293; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16294; GFX10-CU:       ; %bb.0: ; %entry
16295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
16296; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16297; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
16298; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
16299; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16300; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16301; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
16302; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16303; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
16304; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16305; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16306; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
16307; GFX10-CU-NEXT:    s_endpgm
16308;
16309; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16310; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16311; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16312; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16313; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16314; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16315; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16316; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
16317; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
16318; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
16319; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
16320; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
16321; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
16322; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
16323; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16324; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
16325; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
16326; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16327; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
16328; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
16329; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16330; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16331; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16332; SKIP-CACHE-INV-NEXT:    s_endpgm
16333;
16334; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16335; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16336; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16337; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16338; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16339; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16340; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16341; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16342; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16343; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16344; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16345; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16346; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16347; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16348; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16349;
16350; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16351; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16352; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16353; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16354; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16355; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16356; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16357; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16358; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16359; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16360; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16361; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16362; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16363; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16364; GFX90A-TGSPLIT-NEXT:    s_endpgm
16365;
16366; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16367; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16368; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16369; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16370; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16371; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16372; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16373; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16374; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16375; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16376; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16377; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16378; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16379; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16380; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16381;
16382; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16383; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16384; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16385; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16386; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16387; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16388; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16389; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16390; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16391; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16392; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16393; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16394; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16395; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16396; GFX940-TGSPLIT-NEXT:    s_endpgm
16397;
16398; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16399; GFX11-WGP:       ; %bb.0: ; %entry
16400; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
16401; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16402; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16403; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16404; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16405; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
16406; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
16407; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16408; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
16409; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16410; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16411; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16412; GFX11-WGP-NEXT:    s_endpgm
16413;
16414; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16415; GFX11-CU:       ; %bb.0: ; %entry
16416; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
16417; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16418; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16419; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16420; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16421; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
16422; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
16423; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16424; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
16425; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16426; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16427; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16428; GFX11-CU-NEXT:    s_endpgm
16429;
16430; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16431; GFX12-WGP:       ; %bb.0: ; %entry
16432; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
16433; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16434; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16435; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16436; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16437; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
16438; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
16439; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16440; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
16441; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16442; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16443; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16444; GFX12-WGP-NEXT:    s_endpgm
16445;
16446; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
16447; GFX12-CU:       ; %bb.0: ; %entry
16448; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
16449; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16450; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16451; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16452; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16453; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
16454; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
16455; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16456; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
16457; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16458; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
16459; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16460; GFX12-CU-NEXT:    s_endpgm
16461    ptr addrspace(1) %out, i32 %in, i32 %old) {
16462entry:
16463  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
16464  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
16465  %val0 = extractvalue { i32, i1 } %val, 0
16466  store i32 %val0, ptr addrspace(1) %out, align 4
16467  ret void
16468}
16469
; Returning cmpxchg on a global (addrspace(1)) pointer at syncscope
; "wavefront-one-as" with success ordering seq_cst and failure ordering
; monotonic. The value loaded by the cmpxchg is stored back to %out.
; In the expected output for every run line, the cmpswap is followed by a wait
; on its result and then the store; nothing else sits between them apart from
; store-address setup (flat addressing case) and register-kill annotations.
; NOTE(review): presumably no cache maintenance is expected because of the
; wavefront scope - confirm against the AMDGPU memory model documentation.
16470define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
16471; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16472; GFX6:       ; %bb.0: ; %entry
16473; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
16474; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16475; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
16476; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
16477; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
16478; GFX6-NEXT:    s_mov_b32 s12, s5
16479; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
16480; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
16481; GFX6-NEXT:    s_mov_b32 s11, -1
16482; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
16483; GFX6-NEXT:    s_mov_b32 s5, s12
16484; GFX6-NEXT:    s_mov_b32 s6, s11
16485; GFX6-NEXT:    s_mov_b32 s7, s10
16486; GFX6-NEXT:    v_mov_b32_e32 v0, s9
16487; GFX6-NEXT:    v_mov_b32_e32 v2, s8
16488; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16489; GFX6-NEXT:    v_mov_b32_e32 v1, v2
16490; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
16491; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16492; GFX6-NEXT:    s_waitcnt vmcnt(0)
16493; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
16494; GFX6-NEXT:    s_endpgm
16495;
16496; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16497; GFX7:       ; %bb.0: ; %entry
16498; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16499; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16500; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16501; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16502; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16503; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16504; GFX7-NEXT:    s_mov_b32 s6, s4
16505; GFX7-NEXT:    s_mov_b32 s7, s5
16506; GFX7-NEXT:    s_mov_b32 s11, s12
16507; GFX7-NEXT:    s_mov_b32 s10, s13
16508; GFX7-NEXT:    s_add_u32 s6, s6, s11
16509; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16510; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16511; GFX7-NEXT:    s_mov_b32 s7, s10
16512; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16513; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16514; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16515; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16516; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16517; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16518; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16519; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16520; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16521; GFX7-NEXT:    s_waitcnt vmcnt(0)
16522; GFX7-NEXT:    flat_store_dword v[0:1], v2
16523; GFX7-NEXT:    s_endpgm
16524;
16525; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16526; GFX10-WGP:       ; %bb.0: ; %entry
16527; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
16528; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16529; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
16530; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
16531; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16532; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16533; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
16534; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16535; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
16536; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16537; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16538; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
16539; GFX10-WGP-NEXT:    s_endpgm
16540;
16541; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16542; GFX10-CU:       ; %bb.0: ; %entry
16543; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
16544; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16545; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
16546; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
16547; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16548; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16549; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
16550; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16551; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
16552; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16553; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16554; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
16555; GFX10-CU-NEXT:    s_endpgm
16556;
16557; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16558; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16559; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16560; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16561; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16562; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16563; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16564; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
16565; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
16566; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
16567; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
16568; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
16569; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
16570; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
16571; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16572; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
16573; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
16574; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16575; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
16576; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
16577; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16578; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16579; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16580; SKIP-CACHE-INV-NEXT:    s_endpgm
16581;
16582; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16583; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16584; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16585; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16586; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16587; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16588; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16589; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16590; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16591; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16592; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16593; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16594; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16595; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16596; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16597;
16598; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16599; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16600; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16601; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16602; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16603; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16604; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16605; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16606; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16607; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16608; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16609; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16610; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16611; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16612; GFX90A-TGSPLIT-NEXT:    s_endpgm
16613;
16614; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16615; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16616; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16617; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16618; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16619; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16620; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16621; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16622; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16623; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16624; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16625; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16626; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16627; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16628; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16629;
16630; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16631; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16632; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16633; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16634; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16635; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16636; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16637; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16638; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16639; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16640; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16641; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16642; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16643; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16644; GFX940-TGSPLIT-NEXT:    s_endpgm
16645;
16646; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16647; GFX11-WGP:       ; %bb.0: ; %entry
16648; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
16649; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16650; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16651; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16652; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16653; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
16654; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
16655; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16656; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
16657; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16658; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16659; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16660; GFX11-WGP-NEXT:    s_endpgm
16661;
16662; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16663; GFX11-CU:       ; %bb.0: ; %entry
16664; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
16665; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16666; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16667; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16668; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16669; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
16670; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
16671; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16672; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
16673; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16674; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16675; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16676; GFX11-CU-NEXT:    s_endpgm
16677;
16678; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16679; GFX12-WGP:       ; %bb.0: ; %entry
16680; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
16681; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16682; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16683; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16684; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16685; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
16686; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
16687; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16688; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
16689; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16690; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16691; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16692; GFX12-WGP-NEXT:    s_endpgm
16693;
16694; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
16695; GFX12-CU:       ; %bb.0: ; %entry
16696; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
16697; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16698; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16699; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16700; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16701; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
16702; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
16703; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16704; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
16705; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16706; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
16707; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16708; GFX12-CU-NEXT:    s_endpgm
16709    ptr addrspace(1) %out, i32 %in, i32 %old) {
16710entry:
  ; i32 element 4 of %out, i.e. byte offset 16. This shows up as the offset:16
  ; operand (or, for the flat_atomic_cmpswap case, a 64-bit add of the
  ; constant 16) in the expected output above.
16711  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old, swap in %in; success = seq_cst, failure = monotonic.
16712  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
  ; Only field 0 of the result pair (the loaded value) is used; the i1 success
  ; flag is ignored.
16713  %val0 = extractvalue { i32, i1 } %val, 0
16714  store i32 %val0, ptr addrspace(1) %out, align 4
16715  ret void
16716}
16717
; Returning cmpxchg on a global (addrspace(1)) pointer at syncscope
; "wavefront-one-as" with success ordering monotonic and failure ordering
; acquire. The value loaded by the cmpxchg is stored back to %out.
; In the expected output for every run line, the cmpswap is followed by a wait
; on its result and then the store; nothing else sits between them apart from
; store-address setup (flat addressing case) and register-kill annotations.
; NOTE(review): presumably no cache maintenance is expected because of the
; wavefront scope - confirm against the AMDGPU memory model documentation.
16718define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
16719; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16720; GFX6:       ; %bb.0: ; %entry
16721; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
16722; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16723; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
16724; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
16725; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
16726; GFX6-NEXT:    s_mov_b32 s12, s5
16727; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
16728; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
16729; GFX6-NEXT:    s_mov_b32 s11, -1
16730; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
16731; GFX6-NEXT:    s_mov_b32 s5, s12
16732; GFX6-NEXT:    s_mov_b32 s6, s11
16733; GFX6-NEXT:    s_mov_b32 s7, s10
16734; GFX6-NEXT:    v_mov_b32_e32 v0, s9
16735; GFX6-NEXT:    v_mov_b32_e32 v2, s8
16736; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16737; GFX6-NEXT:    v_mov_b32_e32 v1, v2
16738; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
16739; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16740; GFX6-NEXT:    s_waitcnt vmcnt(0)
16741; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
16742; GFX6-NEXT:    s_endpgm
16743;
16744; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16745; GFX7:       ; %bb.0: ; %entry
16746; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16747; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16748; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16749; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16750; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16751; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16752; GFX7-NEXT:    s_mov_b32 s6, s4
16753; GFX7-NEXT:    s_mov_b32 s7, s5
16754; GFX7-NEXT:    s_mov_b32 s11, s12
16755; GFX7-NEXT:    s_mov_b32 s10, s13
16756; GFX7-NEXT:    s_add_u32 s6, s6, s11
16757; GFX7-NEXT:    s_addc_u32 s10, s7, s10
16758; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16759; GFX7-NEXT:    s_mov_b32 s7, s10
16760; GFX7-NEXT:    v_mov_b32_e32 v2, s9
16761; GFX7-NEXT:    v_mov_b32_e32 v0, s8
16762; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16763; GFX7-NEXT:    v_mov_b32_e32 v3, v0
16764; GFX7-NEXT:    v_mov_b32_e32 v0, s6
16765; GFX7-NEXT:    v_mov_b32_e32 v1, s7
16766; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16767; GFX7-NEXT:    v_mov_b32_e32 v0, s4
16768; GFX7-NEXT:    v_mov_b32_e32 v1, s5
16769; GFX7-NEXT:    s_waitcnt vmcnt(0)
16770; GFX7-NEXT:    flat_store_dword v[0:1], v2
16771; GFX7-NEXT:    s_endpgm
16772;
16773; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16774; GFX10-WGP:       ; %bb.0: ; %entry
16775; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
16776; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16777; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
16778; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
16779; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16780; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
16781; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
16782; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16783; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
16784; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16785; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
16786; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
16787; GFX10-WGP-NEXT:    s_endpgm
16788;
16789; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16790; GFX10-CU:       ; %bb.0: ; %entry
16791; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
16792; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16793; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
16794; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
16795; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
16796; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
16797; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
16798; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16799; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
16800; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
16801; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
16802; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
16803; GFX10-CU-NEXT:    s_endpgm
16804;
16805; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16806; SKIP-CACHE-INV:       ; %bb.0: ; %entry
16807; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
16808; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
16809; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
16810; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
16811; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
16812; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
16813; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
16814; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
16815; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
16816; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
16817; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
16818; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
16819; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
16820; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
16821; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
16822; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16823; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
16824; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
16825; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16826; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
16827; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16828; SKIP-CACHE-INV-NEXT:    s_endpgm
16829;
16830; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16831; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
16832; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16833; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16834; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16835; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16836; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16837; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16838; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16839; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16840; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16841; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16842; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16843; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16844; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
16845;
16846; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16847; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
16848; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16849; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
16850; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
16851; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
16852; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16853; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
16854; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
16855; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16856; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16857; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
16858; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16859; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
16860; GFX90A-TGSPLIT-NEXT:    s_endpgm
16861;
16862; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16863; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
16864; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16865; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16866; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16867; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16868; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16869; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16870; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16871; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16872; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16873; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16874; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16875; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16876; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
16877;
16878; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16879; GFX940-TGSPLIT:       ; %bb.0: ; %entry
16880; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
16881; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16882; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
16883; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
16884; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
16885; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
16886; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
16887; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16888; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
16889; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
16890; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
16891; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
16892; GFX940-TGSPLIT-NEXT:    s_endpgm
16893;
16894; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16895; GFX11-WGP:       ; %bb.0: ; %entry
16896; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
16897; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16898; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16899; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16900; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
16901; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
16902; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
16903; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16904; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
16905; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16906; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
16907; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16908; GFX11-WGP-NEXT:    s_endpgm
16909;
16910; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16911; GFX11-CU:       ; %bb.0: ; %entry
16912; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
16913; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16914; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16915; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16916; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
16917; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
16918; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
16919; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16920; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
16921; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
16922; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
16923; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16924; GFX11-CU-NEXT:    s_endpgm
16925;
16926; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16927; GFX12-WGP:       ; %bb.0: ; %entry
16928; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
16929; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16930; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
16931; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
16932; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
16933; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
16934; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
16935; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16936; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
16937; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16938; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
16939; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
16940; GFX12-WGP-NEXT:    s_endpgm
16941;
16942; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
16943; GFX12-CU:       ; %bb.0: ; %entry
16944; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
16945; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
16946; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
16947; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
16948; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
16949; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
16950; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
16951; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
16952; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
16953; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
16954; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
16955; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
16956; GFX12-CU-NEXT:    s_endpgm
16957    ptr addrspace(1) %out, i32 %in, i32 %old) {
16958entry:
  ; i32 element 4 of %out, i.e. byte offset 16. This shows up as the offset:16
  ; operand (or, for the flat_atomic_cmpswap case, a 64-bit add of the
  ; constant 16) in the expected output above.
16959  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old, swap in %in; success = monotonic, failure = acquire.
16960  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
  ; Only field 0 of the result pair (the loaded value) is used; the i1 success
  ; flag is ignored.
16961  %val0 = extractvalue { i32, i1 } %val, 0
16962  store i32 %val0, ptr addrspace(1) %out, align 4
16963  ret void
16964}
16965
; Returning cmpxchg on global memory (addrspace 1) at syncscope
; "wavefront-one-as" with acquire success ordering and acquire failure
; ordering. The autogenerated assertions between the define line and the
; parameter list pin the llc -O0 output for each configuration listed in the
; run lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
16966define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
16967; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
16968; GFX6:       ; %bb.0: ; %entry
16969; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
16970; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16971; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
16972; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
16973; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
16974; GFX6-NEXT:    s_mov_b32 s12, s5
16975; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
16976; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
16977; GFX6-NEXT:    s_mov_b32 s11, -1
16978; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
16979; GFX6-NEXT:    s_mov_b32 s5, s12
16980; GFX6-NEXT:    s_mov_b32 s6, s11
16981; GFX6-NEXT:    s_mov_b32 s7, s10
16982; GFX6-NEXT:    v_mov_b32_e32 v0, s9
16983; GFX6-NEXT:    v_mov_b32_e32 v2, s8
16984; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
16985; GFX6-NEXT:    v_mov_b32_e32 v1, v2
16986; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
16987; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
16988; GFX6-NEXT:    s_waitcnt vmcnt(0)
16989; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
16990; GFX6-NEXT:    s_endpgm
16991;
16992; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
16993; GFX7:       ; %bb.0: ; %entry
16994; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
16995; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
16996; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
16997; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
16998; GFX7-NEXT:    s_mov_b64 s[12:13], 16
16999; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17000; GFX7-NEXT:    s_mov_b32 s6, s4
17001; GFX7-NEXT:    s_mov_b32 s7, s5
17002; GFX7-NEXT:    s_mov_b32 s11, s12
17003; GFX7-NEXT:    s_mov_b32 s10, s13
17004; GFX7-NEXT:    s_add_u32 s6, s6, s11
17005; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17006; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17007; GFX7-NEXT:    s_mov_b32 s7, s10
17008; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17009; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17010; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17011; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17012; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17013; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17014; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17015; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17016; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17017; GFX7-NEXT:    s_waitcnt vmcnt(0)
17018; GFX7-NEXT:    flat_store_dword v[0:1], v2
17019; GFX7-NEXT:    s_endpgm
17020;
17021; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17022; GFX10-WGP:       ; %bb.0: ; %entry
17023; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
17024; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17025; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
17026; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
17027; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17028; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17029; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
17030; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17031; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
17032; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17033; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17034; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
17035; GFX10-WGP-NEXT:    s_endpgm
17036;
17037; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17038; GFX10-CU:       ; %bb.0: ; %entry
17039; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
17040; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17041; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
17042; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
17043; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17044; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17045; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
17046; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17047; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
17048; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17049; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17050; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
17051; GFX10-CU-NEXT:    s_endpgm
17052;
17053; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17054; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17055; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17056; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17057; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17058; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17059; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17060; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
17061; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
17062; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
17063; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
17064; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
17065; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
17066; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
17067; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17068; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
17069; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
17070; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17071; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
17072; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
17073; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17074; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17075; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
17076; SKIP-CACHE-INV-NEXT:    s_endpgm
17077;
17078; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17079; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17080; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17081; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17082; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17083; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17084; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17085; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17086; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17087; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17088; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17089; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17090; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17091; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17092; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17093;
17094; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17095; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17096; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17097; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17098; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17099; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17100; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17101; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17102; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17103; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17104; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17105; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17106; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17107; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17108; GFX90A-TGSPLIT-NEXT:    s_endpgm
17109;
17110; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17111; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17112; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17113; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17114; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17115; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17116; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17117; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17118; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17119; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17120; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17121; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17122; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17123; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17124; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17125;
17126; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17127; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17128; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17129; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17130; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17131; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17132; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17133; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17134; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17135; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17136; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17137; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17138; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17139; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17140; GFX940-TGSPLIT-NEXT:    s_endpgm
17141;
17142; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17143; GFX11-WGP:       ; %bb.0: ; %entry
17144; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
17145; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17146; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17147; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17148; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17149; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
17150; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
17151; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17152; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
17153; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17154; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17155; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17156; GFX11-WGP-NEXT:    s_endpgm
17157;
17158; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17159; GFX11-CU:       ; %bb.0: ; %entry
17160; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
17161; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17162; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17163; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17164; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17165; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
17166; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
17167; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17168; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
17169; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17170; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17171; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17172; GFX11-CU-NEXT:    s_endpgm
17173;
17174; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17175; GFX12-WGP:       ; %bb.0: ; %entry
17176; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
17177; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17178; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17179; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17180; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17181; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
17182; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
17183; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17184; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
17185; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17186; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17187; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17188; GFX12-WGP-NEXT:    s_endpgm
17189;
17190; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
17191; GFX12-CU:       ; %bb.0: ; %entry
17192; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
17193; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17194; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17195; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17196; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17197; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
17198; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
17199; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17200; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
17201; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17202; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17203; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17204; GFX12-CU-NEXT:    s_endpgm
17205    ptr addrspace(1) %out, i32 %in, i32 %old) {
17206entry:
  ; %gep addresses i32 element 4 of %out, i.e. byte offset 16; this is the
  ; offset:16 operand (or, for GFX7, the explicit 64-bit add of 16) in the
  ; expected code above.
17207  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; %old is the comparand and %in the replacement value; success and failure
  ; orderings are both acquire.
17208  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory;
  ; storing it keeps the returned value live so the returning form is emitted.
17209  %val0 = extractvalue { i32, i1 } %val, 0
17210  store i32 %val0, ptr addrspace(1) %out, align 4
17211  ret void
17212}
17213
; Returning cmpxchg on global memory (addrspace 1) at syncscope
; "wavefront-one-as" with release success ordering and acquire failure
; ordering. The autogenerated assertions between the define line and the
; parameter list pin the llc -O0 output for each configuration listed in the
; run lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
17214define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
17215; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17216; GFX6:       ; %bb.0: ; %entry
17217; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
17218; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17219; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
17220; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
17221; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17222; GFX6-NEXT:    s_mov_b32 s12, s5
17223; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
17224; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
17225; GFX6-NEXT:    s_mov_b32 s11, -1
17226; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
17227; GFX6-NEXT:    s_mov_b32 s5, s12
17228; GFX6-NEXT:    s_mov_b32 s6, s11
17229; GFX6-NEXT:    s_mov_b32 s7, s10
17230; GFX6-NEXT:    v_mov_b32_e32 v0, s9
17231; GFX6-NEXT:    v_mov_b32_e32 v2, s8
17232; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17233; GFX6-NEXT:    v_mov_b32_e32 v1, v2
17234; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
17235; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17236; GFX6-NEXT:    s_waitcnt vmcnt(0)
17237; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
17238; GFX6-NEXT:    s_endpgm
17239;
17240; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17241; GFX7:       ; %bb.0: ; %entry
17242; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17243; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17244; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17245; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17246; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17247; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17248; GFX7-NEXT:    s_mov_b32 s6, s4
17249; GFX7-NEXT:    s_mov_b32 s7, s5
17250; GFX7-NEXT:    s_mov_b32 s11, s12
17251; GFX7-NEXT:    s_mov_b32 s10, s13
17252; GFX7-NEXT:    s_add_u32 s6, s6, s11
17253; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17254; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17255; GFX7-NEXT:    s_mov_b32 s7, s10
17256; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17257; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17258; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17259; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17260; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17261; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17262; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17263; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17264; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17265; GFX7-NEXT:    s_waitcnt vmcnt(0)
17266; GFX7-NEXT:    flat_store_dword v[0:1], v2
17267; GFX7-NEXT:    s_endpgm
17268;
17269; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17270; GFX10-WGP:       ; %bb.0: ; %entry
17271; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
17272; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17273; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
17274; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
17275; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17276; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17277; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
17278; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17279; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
17280; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17281; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17282; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
17283; GFX10-WGP-NEXT:    s_endpgm
17284;
17285; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17286; GFX10-CU:       ; %bb.0: ; %entry
17287; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
17288; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17289; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
17290; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
17291; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17292; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17293; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
17294; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17295; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
17296; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17297; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17298; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
17299; GFX10-CU-NEXT:    s_endpgm
17300;
17301; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17302; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17303; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17304; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17305; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17306; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17307; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17308; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
17309; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
17310; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
17311; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
17312; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
17313; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
17314; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
17315; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17316; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
17317; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
17318; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17319; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
17320; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
17321; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17322; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17323; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
17324; SKIP-CACHE-INV-NEXT:    s_endpgm
17325;
17326; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17327; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17328; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17329; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17330; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17331; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17332; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17333; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17334; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17335; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17336; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17337; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17338; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17339; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17340; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17341;
17342; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17343; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17344; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17345; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17346; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17347; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17348; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17349; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17350; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17351; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17352; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17353; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17354; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17355; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17356; GFX90A-TGSPLIT-NEXT:    s_endpgm
17357;
17358; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17359; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17360; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17361; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17362; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17363; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17364; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17365; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17366; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17367; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17368; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17369; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17370; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17371; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17372; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17373;
17374; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17375; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17376; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17377; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17378; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17379; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17380; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17381; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17382; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17383; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17384; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17385; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17386; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17387; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17388; GFX940-TGSPLIT-NEXT:    s_endpgm
17389;
17390; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17391; GFX11-WGP:       ; %bb.0: ; %entry
17392; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
17393; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17394; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17395; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17396; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17397; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
17398; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
17399; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17400; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
17401; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17402; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17403; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17404; GFX11-WGP-NEXT:    s_endpgm
17405;
17406; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17407; GFX11-CU:       ; %bb.0: ; %entry
17408; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
17409; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17410; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17411; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17412; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17413; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
17414; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
17415; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17416; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
17417; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17418; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17419; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17420; GFX11-CU-NEXT:    s_endpgm
17421;
17422; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17423; GFX12-WGP:       ; %bb.0: ; %entry
17424; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
17425; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17426; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17427; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17428; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17429; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
17430; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
17431; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17432; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
17433; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17434; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17435; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17436; GFX12-WGP-NEXT:    s_endpgm
17437;
17438; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
17439; GFX12-CU:       ; %bb.0: ; %entry
17440; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
17441; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17442; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17443; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17444; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17445; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
17446; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
17447; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17448; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
17449; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17450; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17451; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17452; GFX12-CU-NEXT:    s_endpgm
17453    ptr addrspace(1) %out, i32 %in, i32 %old) {
17454entry:
  ; %gep addresses i32 element 4 of %out, i.e. byte offset 16; this is the
  ; offset:16 operand (or, for GFX7, the explicit 64-bit add of 16) in the
  ; expected code above.
17455  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; %old is the comparand and %in the replacement value; success ordering is
  ; release and failure ordering is acquire.
17456  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
  ; Field 0 of the cmpxchg result is the value that was loaded from memory;
  ; storing it keeps the returned value live so the returning form is emitted.
17457  %val0 = extractvalue { i32, i1 } %val, 0
17458  store i32 %val0, ptr addrspace(1) %out, align 4
17459  ret void
17460}
17461
17462define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
17463; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17464; GFX6:       ; %bb.0: ; %entry
17465; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
17466; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17467; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
17468; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
17469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17470; GFX6-NEXT:    s_mov_b32 s12, s5
17471; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
17472; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
17473; GFX6-NEXT:    s_mov_b32 s11, -1
17474; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
17475; GFX6-NEXT:    s_mov_b32 s5, s12
17476; GFX6-NEXT:    s_mov_b32 s6, s11
17477; GFX6-NEXT:    s_mov_b32 s7, s10
17478; GFX6-NEXT:    v_mov_b32_e32 v0, s9
17479; GFX6-NEXT:    v_mov_b32_e32 v2, s8
17480; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17481; GFX6-NEXT:    v_mov_b32_e32 v1, v2
17482; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
17483; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17484; GFX6-NEXT:    s_waitcnt vmcnt(0)
17485; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
17486; GFX6-NEXT:    s_endpgm
17487;
17488; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17489; GFX7:       ; %bb.0: ; %entry
17490; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17491; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17492; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17493; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17494; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17495; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17496; GFX7-NEXT:    s_mov_b32 s6, s4
17497; GFX7-NEXT:    s_mov_b32 s7, s5
17498; GFX7-NEXT:    s_mov_b32 s11, s12
17499; GFX7-NEXT:    s_mov_b32 s10, s13
17500; GFX7-NEXT:    s_add_u32 s6, s6, s11
17501; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17502; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17503; GFX7-NEXT:    s_mov_b32 s7, s10
17504; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17505; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17506; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17507; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17508; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17509; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17510; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17511; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17512; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17513; GFX7-NEXT:    s_waitcnt vmcnt(0)
17514; GFX7-NEXT:    flat_store_dword v[0:1], v2
17515; GFX7-NEXT:    s_endpgm
17516;
17517; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17518; GFX10-WGP:       ; %bb.0: ; %entry
17519; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
17520; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17521; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
17522; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
17523; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17524; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17525; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
17526; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17527; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
17528; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17529; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17530; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
17531; GFX10-WGP-NEXT:    s_endpgm
17532;
17533; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17534; GFX10-CU:       ; %bb.0: ; %entry
17535; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
17536; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17537; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
17538; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
17539; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17540; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17541; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
17542; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17543; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
17544; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17545; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17546; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
17547; GFX10-CU-NEXT:    s_endpgm
17548;
17549; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17550; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17551; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17552; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17553; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17554; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17555; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17556; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
17557; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
17558; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
17559; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
17560; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
17561; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
17562; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
17563; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17564; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
17565; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
17566; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17567; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
17568; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
17569; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17570; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17571; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
17572; SKIP-CACHE-INV-NEXT:    s_endpgm
17573;
17574; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17575; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17576; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17577; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17578; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17579; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17580; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17581; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17582; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17583; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17584; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17585; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17586; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17587; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17588; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17589;
17590; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17591; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17592; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17593; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17594; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17595; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17596; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17597; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17598; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17599; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17600; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17601; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17602; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17603; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17604; GFX90A-TGSPLIT-NEXT:    s_endpgm
17605;
17606; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17607; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17608; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17609; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17610; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17611; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17612; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17613; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17614; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17615; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17616; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17617; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17618; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17619; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17620; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17621;
17622; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17623; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17624; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17625; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17626; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17627; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17628; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17629; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17630; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17631; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17632; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17633; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17634; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17635; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17636; GFX940-TGSPLIT-NEXT:    s_endpgm
17637;
17638; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17639; GFX11-WGP:       ; %bb.0: ; %entry
17640; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
17641; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17642; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17643; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17644; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17645; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
17646; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
17647; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17648; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
17649; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17650; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17651; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17652; GFX11-WGP-NEXT:    s_endpgm
17653;
17654; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17655; GFX11-CU:       ; %bb.0: ; %entry
17656; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
17657; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17658; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17659; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17660; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17661; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
17662; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
17663; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17664; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
17665; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17666; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17667; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17668; GFX11-CU-NEXT:    s_endpgm
17669;
17670; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17671; GFX12-WGP:       ; %bb.0: ; %entry
17672; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
17673; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17674; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17675; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17676; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17677; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
17678; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
17679; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17680; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
17681; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17682; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17683; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17684; GFX12-WGP-NEXT:    s_endpgm
17685;
17686; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
17687; GFX12-CU:       ; %bb.0: ; %entry
17688; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
17689; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17690; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17691; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17692; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17693; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
17694; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
17695; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17696; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
17697; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17698; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17699; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17700; GFX12-CU-NEXT:    s_endpgm
17701    ptr addrspace(1) %out, i32 %in, i32 %old) {
17702entry:
17703  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
17704  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
17705  %val0 = extractvalue { i32, i1 } %val, 0
17706  store i32 %val0, ptr addrspace(1) %out, align 4
17707  ret void
17708}
17709
; Returning cmpxchg on global memory (addrspace(1)) at syncscope
; "wavefront-one-as", with success ordering seq_cst and failure ordering
; acquire. The i32 value produced by the cmpxchg is stored back to %out.
;
; The assertion lines inside this function are autogenerated (see the NOTE on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; For every run line the expected code is a single returning compare-and-swap
; followed by a wait on the vector-memory/load counter before the store; none
; of the expected sequences contains a cache invalidate or write-back
; instruction.
17710define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
17711; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17712; GFX6:       ; %bb.0: ; %entry
17713; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
17714; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17715; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
17716; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
17717; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17718; GFX6-NEXT:    s_mov_b32 s12, s5
17719; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
17720; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
17721; GFX6-NEXT:    s_mov_b32 s11, -1
17722; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
17723; GFX6-NEXT:    s_mov_b32 s5, s12
17724; GFX6-NEXT:    s_mov_b32 s6, s11
17725; GFX6-NEXT:    s_mov_b32 s7, s10
17726; GFX6-NEXT:    v_mov_b32_e32 v0, s9
17727; GFX6-NEXT:    v_mov_b32_e32 v2, s8
17728; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17729; GFX6-NEXT:    v_mov_b32_e32 v1, v2
17730; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
17731; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17732; GFX6-NEXT:    s_waitcnt vmcnt(0)
17733; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
17734; GFX6-NEXT:    s_endpgm
17735;
17736; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17737; GFX7:       ; %bb.0: ; %entry
17738; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17739; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17740; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17741; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17742; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17743; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17744; GFX7-NEXT:    s_mov_b32 s6, s4
17745; GFX7-NEXT:    s_mov_b32 s7, s5
17746; GFX7-NEXT:    s_mov_b32 s11, s12
17747; GFX7-NEXT:    s_mov_b32 s10, s13
17748; GFX7-NEXT:    s_add_u32 s6, s6, s11
17749; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17750; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17751; GFX7-NEXT:    s_mov_b32 s7, s10
17752; GFX7-NEXT:    v_mov_b32_e32 v2, s9
17753; GFX7-NEXT:    v_mov_b32_e32 v0, s8
17754; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17755; GFX7-NEXT:    v_mov_b32_e32 v3, v0
17756; GFX7-NEXT:    v_mov_b32_e32 v0, s6
17757; GFX7-NEXT:    v_mov_b32_e32 v1, s7
17758; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17759; GFX7-NEXT:    v_mov_b32_e32 v0, s4
17760; GFX7-NEXT:    v_mov_b32_e32 v1, s5
17761; GFX7-NEXT:    s_waitcnt vmcnt(0)
17762; GFX7-NEXT:    flat_store_dword v[0:1], v2
17763; GFX7-NEXT:    s_endpgm
17764;
17765; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17766; GFX10-WGP:       ; %bb.0: ; %entry
17767; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
17768; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17769; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
17770; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
17771; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17772; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
17773; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
17774; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17775; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
17776; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17777; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
17778; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
17779; GFX10-WGP-NEXT:    s_endpgm
17780;
17781; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17782; GFX10-CU:       ; %bb.0: ; %entry
17783; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
17784; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17785; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
17786; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
17787; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
17788; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
17789; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
17790; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17791; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
17792; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
17793; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
17794; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
17795; GFX10-CU-NEXT:    s_endpgm
17796;
17797; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17798; SKIP-CACHE-INV:       ; %bb.0: ; %entry
17799; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
17800; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
17801; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
17802; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
17803; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
17804; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
17805; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
17806; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
17807; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
17808; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
17809; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
17810; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
17811; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
17812; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
17813; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
17814; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17815; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
17816; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
17817; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17818; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
17819; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
17820; SKIP-CACHE-INV-NEXT:    s_endpgm
17821;
17822; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17823; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
17824; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17825; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17826; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17827; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17828; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17829; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17830; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17831; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17832; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17833; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17834; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17835; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17836; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
17837;
17838; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17839; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
17840; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17841; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
17842; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
17843; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
17844; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17845; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
17846; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
17847; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17848; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17849; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
17850; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17851; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
17852; GFX90A-TGSPLIT-NEXT:    s_endpgm
17853;
17854; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17855; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
17856; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17857; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17858; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17859; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17860; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17861; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17862; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17863; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17864; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17865; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17866; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17867; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17868; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
17869;
17870; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17871; GFX940-TGSPLIT:       ; %bb.0: ; %entry
17872; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
17873; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
17874; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
17875; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
17876; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
17877; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
17878; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
17879; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17880; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
17881; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
17882; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
17883; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
17884; GFX940-TGSPLIT-NEXT:    s_endpgm
17885;
17886; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17887; GFX11-WGP:       ; %bb.0: ; %entry
17888; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
17889; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17890; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17891; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17892; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
17893; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
17894; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
17895; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17896; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
17897; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17898; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
17899; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17900; GFX11-WGP-NEXT:    s_endpgm
17901;
17902; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17903; GFX11-CU:       ; %bb.0: ; %entry
17904; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
17905; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17906; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17907; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17908; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
17909; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
17910; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
17911; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17912; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
17913; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
17914; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
17915; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17916; GFX11-CU-NEXT:    s_endpgm
17917;
17918; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17919; GFX12-WGP:       ; %bb.0: ; %entry
17920; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
17921; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17922; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
17923; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
17924; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
17925; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
17926; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
17927; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17928; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
17929; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17930; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
17931; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
17932; GFX12-WGP-NEXT:    s_endpgm
17933;
17934; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
17935; GFX12-CU:       ; %bb.0: ; %entry
17936; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
17937; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
17938; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
17939; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
17940; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
17941; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
17942; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
17943; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
17944; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
17945; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
17946; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
17947; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
17948; GFX12-CU-NEXT:    s_endpgm
17949    ptr addrspace(1) %out, i32 %in, i32 %old) {
17950entry:
  ; Address of the i32 four elements past %out; the expected output above
  ; addresses it with offset:16, or with an explicit 64-bit add of 16 where
  ; flat_atomic_cmpswap is used.
17951  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  ; Compare against %old and swap in %in: success ordering seq_cst, failure
  ; ordering acquire, scope "wavefront-one-as".
17952  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
  ; Element 0 of the { i32, i1 } result is the i32 value; the i1 success flag
  ; (element 1) is not used by this test.
17953  %val0 = extractvalue { i32, i1 } %val, 0
  ; Storing the result keeps the returning form of the atomic alive.
17954  store i32 %val0, ptr addrspace(1) %out, align 4
17955  ret void
17956}
17957
17958define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
17959; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
17960; GFX6:       ; %bb.0: ; %entry
17961; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
17962; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17963; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
17964; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
17965; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17966; GFX6-NEXT:    s_mov_b32 s12, s5
17967; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
17968; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
17969; GFX6-NEXT:    s_mov_b32 s11, -1
17970; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
17971; GFX6-NEXT:    s_mov_b32 s5, s12
17972; GFX6-NEXT:    s_mov_b32 s6, s11
17973; GFX6-NEXT:    s_mov_b32 s7, s10
17974; GFX6-NEXT:    v_mov_b32_e32 v0, s9
17975; GFX6-NEXT:    v_mov_b32_e32 v2, s8
17976; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
17977; GFX6-NEXT:    v_mov_b32_e32 v1, v2
17978; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
17979; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
17980; GFX6-NEXT:    s_waitcnt vmcnt(0)
17981; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
17982; GFX6-NEXT:    s_endpgm
17983;
17984; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
17985; GFX7:       ; %bb.0: ; %entry
17986; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
17987; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
17988; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
17989; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
17990; GFX7-NEXT:    s_mov_b64 s[12:13], 16
17991; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
17992; GFX7-NEXT:    s_mov_b32 s6, s4
17993; GFX7-NEXT:    s_mov_b32 s7, s5
17994; GFX7-NEXT:    s_mov_b32 s11, s12
17995; GFX7-NEXT:    s_mov_b32 s10, s13
17996; GFX7-NEXT:    s_add_u32 s6, s6, s11
17997; GFX7-NEXT:    s_addc_u32 s10, s7, s10
17998; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17999; GFX7-NEXT:    s_mov_b32 s7, s10
18000; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18001; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18002; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18003; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18004; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18005; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18006; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18007; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18008; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18009; GFX7-NEXT:    s_waitcnt vmcnt(0)
18010; GFX7-NEXT:    flat_store_dword v[0:1], v2
18011; GFX7-NEXT:    s_endpgm
18012;
18013; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18014; GFX10-WGP:       ; %bb.0: ; %entry
18015; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
18016; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18017; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
18018; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
18019; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18020; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18021; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
18022; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18023; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
18024; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18025; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18026; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
18027; GFX10-WGP-NEXT:    s_endpgm
18028;
18029; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18030; GFX10-CU:       ; %bb.0: ; %entry
18031; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
18032; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18033; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
18034; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
18035; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18036; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18037; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
18038; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18039; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
18040; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18041; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
18042; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
18043; GFX10-CU-NEXT:    s_endpgm
18044;
18045; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18046; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18047; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18048; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18049; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18050; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18051; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18052; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
18053; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
18054; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
18055; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
18056; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
18057; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
18058; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
18059; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18060; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
18061; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
18062; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18063; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
18064; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
18065; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18066; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18067; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
18068; SKIP-CACHE-INV-NEXT:    s_endpgm
18069;
18070; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18071; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18072; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18073; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18074; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18075; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18076; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18077; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18078; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18079; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18080; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18081; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18082; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18083; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18084; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18085;
18086; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18087; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18088; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18089; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18090; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18091; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18092; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18093; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18094; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18095; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18096; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18097; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18098; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18099; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18100; GFX90A-TGSPLIT-NEXT:    s_endpgm
18101;
18102; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18103; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18104; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18105; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18106; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18107; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18108; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18109; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18110; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18111; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18112; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18113; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18114; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18115; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18116; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18117;
18118; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18119; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18120; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18121; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18122; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18123; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18124; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18125; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18126; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18127; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18128; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18129; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18130; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18131; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18132; GFX940-TGSPLIT-NEXT:    s_endpgm
18133;
18134; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18135; GFX11-WGP:       ; %bb.0: ; %entry
18136; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
18137; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18138; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18139; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18140; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18141; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
18142; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
18143; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18144; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
18145; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18146; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18147; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18148; GFX11-WGP-NEXT:    s_endpgm
18149;
18150; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18151; GFX11-CU:       ; %bb.0: ; %entry
18152; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
18153; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18154; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18155; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18156; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18157; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
18158; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
18159; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18160; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
18161; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18162; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18163; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18164; GFX11-CU-NEXT:    s_endpgm
18165;
18166; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18167; GFX12-WGP:       ; %bb.0: ; %entry
18168; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
18169; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18170; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18171; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18172; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18173; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
18174; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
18175; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18176; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
18177; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18178; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18179; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18180; GFX12-WGP-NEXT:    s_endpgm
18181;
18182; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
18183; GFX12-CU:       ; %bb.0: ; %entry
18184; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
18185; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18186; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18187; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18188; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18189; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
18190; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
18191; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18192; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
18193; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18194; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18195; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18196; GFX12-CU-NEXT:    s_endpgm
18197    ptr addrspace(1) %out, i32 %in, i32 %old) {
18198entry:
18199  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
18200  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
18201  %val0 = extractvalue { i32, i1 } %val, 0
18202  store i32 %val0, ptr addrspace(1) %out, align 4
18203  ret void
18204}
18205
; Returning cmpxchg on global memory (addrspace(1)) at syncscope
; "wavefront-one-as" with success ordering acquire and failure ordering seq_cst.
; The exchange targets the i32 element 4 past %out (byte offset 16 in the
; expected output below); the value read from memory is then stored to %out.
; The check lines inside this function are autogenerated (see the header note
; on the first line of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
18206define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
18207; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18208; GFX6:       ; %bb.0: ; %entry
18209; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
18210; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18211; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
18212; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
18213; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
18214; GFX6-NEXT:    s_mov_b32 s12, s5
18215; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
18216; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
18217; GFX6-NEXT:    s_mov_b32 s11, -1
18218; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
18219; GFX6-NEXT:    s_mov_b32 s5, s12
18220; GFX6-NEXT:    s_mov_b32 s6, s11
18221; GFX6-NEXT:    s_mov_b32 s7, s10
18222; GFX6-NEXT:    v_mov_b32_e32 v0, s9
18223; GFX6-NEXT:    v_mov_b32_e32 v2, s8
18224; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18225; GFX6-NEXT:    v_mov_b32_e32 v1, v2
18226; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
18227; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18228; GFX6-NEXT:    s_waitcnt vmcnt(0)
18229; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
18230; GFX6-NEXT:    s_endpgm
18231;
18232; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18233; GFX7:       ; %bb.0: ; %entry
18234; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18235; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18236; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18237; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18238; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18239; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18240; GFX7-NEXT:    s_mov_b32 s6, s4
18241; GFX7-NEXT:    s_mov_b32 s7, s5
18242; GFX7-NEXT:    s_mov_b32 s11, s12
18243; GFX7-NEXT:    s_mov_b32 s10, s13
18244; GFX7-NEXT:    s_add_u32 s6, s6, s11
18245; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18246; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18247; GFX7-NEXT:    s_mov_b32 s7, s10
18248; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18249; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18250; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18251; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18252; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18253; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18254; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18255; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18256; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18257; GFX7-NEXT:    s_waitcnt vmcnt(0)
18258; GFX7-NEXT:    flat_store_dword v[0:1], v2
18259; GFX7-NEXT:    s_endpgm
18260;
18261; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18262; GFX10-WGP:       ; %bb.0: ; %entry
18263; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
18264; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18265; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
18266; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
18267; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18268; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18269; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
18270; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18271; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
18272; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18273; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18274; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
18275; GFX10-WGP-NEXT:    s_endpgm
18276;
18277; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18278; GFX10-CU:       ; %bb.0: ; %entry
18279; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
18280; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18281; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
18282; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
18283; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18284; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18285; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
18286; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18287; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
18288; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18289; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
18290; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
18291; GFX10-CU-NEXT:    s_endpgm
18292;
18293; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18294; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18295; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18296; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18297; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18298; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18299; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18300; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
18301; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
18302; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
18303; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
18304; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
18305; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
18306; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
18307; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18308; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
18309; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
18310; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18311; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
18312; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
18313; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18314; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18315; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
18316; SKIP-CACHE-INV-NEXT:    s_endpgm
18317;
18318; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18319; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18320; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18321; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18322; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18323; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18324; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18325; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18326; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18327; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18328; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18329; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18330; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18331; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18332; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18333;
18334; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18335; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18336; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18337; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18338; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18339; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18340; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18341; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18342; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18343; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18344; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18345; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18346; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18347; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18348; GFX90A-TGSPLIT-NEXT:    s_endpgm
18349;
18350; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18351; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18352; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18353; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18354; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18355; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18356; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18357; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18358; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18359; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18360; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18361; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18362; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18363; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18364; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18365;
18366; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18367; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18368; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18369; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18370; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18371; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18372; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18373; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18374; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18375; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18376; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18377; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18378; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18379; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18380; GFX940-TGSPLIT-NEXT:    s_endpgm
18381;
18382; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18383; GFX11-WGP:       ; %bb.0: ; %entry
18384; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
18385; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18386; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18387; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18388; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18389; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
18390; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
18391; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18392; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
18393; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18394; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18395; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18396; GFX11-WGP-NEXT:    s_endpgm
18397;
18398; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18399; GFX11-CU:       ; %bb.0: ; %entry
18400; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
18401; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18402; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18403; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18404; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18405; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
18406; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
18407; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18408; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
18409; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18410; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18411; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18412; GFX11-CU-NEXT:    s_endpgm
18413;
18414; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18415; GFX12-WGP:       ; %bb.0: ; %entry
18416; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
18417; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18418; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18419; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18420; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18421; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
18422; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
18423; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18424; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
18425; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18426; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18427; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18428; GFX12-WGP-NEXT:    s_endpgm
18429;
18430; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
18431; GFX12-CU:       ; %bb.0: ; %entry
18432; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
18433; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18434; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18435; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18436; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18437; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
18438; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
18439; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18440; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
18441; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18442; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18443; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18444; GFX12-CU-NEXT:    s_endpgm
18445    ptr addrspace(1) %out, i32 %in, i32 %old) {
18446entry:
  ; Address of the i32 element 4 past %out, i.e. byte offset 16.
18447  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
18448  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
  ; Field 0 of the { i32, i1 } cmpxchg result is the value that gets stored to %out.
18449  %val0 = extractvalue { i32, i1 } %val, 0
18450  store i32 %val0, ptr addrspace(1) %out, align 4
18451  ret void
18452}
18453
; Returning cmpxchg on global memory (addrspace(1)) at syncscope
; "wavefront-one-as" with success ordering release and failure ordering seq_cst.
; The exchange targets the i32 element 4 past %out (byte offset 16 in the
; expected output below); the value read from memory is then stored to %out.
; The check lines inside this function are autogenerated (see the header note
; on the first line of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
18454define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
18455; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18456; GFX6:       ; %bb.0: ; %entry
18457; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
18458; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18459; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
18460; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
18461; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
18462; GFX6-NEXT:    s_mov_b32 s12, s5
18463; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
18464; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
18465; GFX6-NEXT:    s_mov_b32 s11, -1
18466; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
18467; GFX6-NEXT:    s_mov_b32 s5, s12
18468; GFX6-NEXT:    s_mov_b32 s6, s11
18469; GFX6-NEXT:    s_mov_b32 s7, s10
18470; GFX6-NEXT:    v_mov_b32_e32 v0, s9
18471; GFX6-NEXT:    v_mov_b32_e32 v2, s8
18472; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18473; GFX6-NEXT:    v_mov_b32_e32 v1, v2
18474; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
18475; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18476; GFX6-NEXT:    s_waitcnt vmcnt(0)
18477; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
18478; GFX6-NEXT:    s_endpgm
18479;
18480; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18481; GFX7:       ; %bb.0: ; %entry
18482; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18483; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18484; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18485; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18486; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18487; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18488; GFX7-NEXT:    s_mov_b32 s6, s4
18489; GFX7-NEXT:    s_mov_b32 s7, s5
18490; GFX7-NEXT:    s_mov_b32 s11, s12
18491; GFX7-NEXT:    s_mov_b32 s10, s13
18492; GFX7-NEXT:    s_add_u32 s6, s6, s11
18493; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18494; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18495; GFX7-NEXT:    s_mov_b32 s7, s10
18496; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18497; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18498; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18499; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18500; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18501; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18502; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18503; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18504; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18505; GFX7-NEXT:    s_waitcnt vmcnt(0)
18506; GFX7-NEXT:    flat_store_dword v[0:1], v2
18507; GFX7-NEXT:    s_endpgm
18508;
18509; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18510; GFX10-WGP:       ; %bb.0: ; %entry
18511; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
18512; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18513; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
18514; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
18515; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18516; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18517; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
18518; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18519; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
18520; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18521; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18522; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
18523; GFX10-WGP-NEXT:    s_endpgm
18524;
18525; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18526; GFX10-CU:       ; %bb.0: ; %entry
18527; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
18528; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18529; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
18530; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
18531; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18532; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18533; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
18534; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18535; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
18536; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18537; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
18538; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
18539; GFX10-CU-NEXT:    s_endpgm
18540;
18541; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18542; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18543; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18544; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18545; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18546; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18547; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18548; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
18549; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
18550; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
18551; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
18552; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
18553; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
18554; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
18555; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18556; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
18557; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
18558; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18559; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
18560; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
18561; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18562; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18563; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
18564; SKIP-CACHE-INV-NEXT:    s_endpgm
18565;
18566; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18567; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18568; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18569; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18570; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18571; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18572; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18573; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18574; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18575; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18576; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18577; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18578; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18579; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18580; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18581;
18582; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18583; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18584; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18585; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18586; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18587; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18588; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18589; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18590; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18591; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18592; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18593; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18594; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18595; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18596; GFX90A-TGSPLIT-NEXT:    s_endpgm
18597;
18598; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18599; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18600; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18601; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18602; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18603; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18604; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18605; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18606; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18607; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18608; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18609; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18610; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18611; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18612; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18613;
18614; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18615; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18616; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18617; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18618; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18619; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18620; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18621; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18622; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18623; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18624; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18625; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18626; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18627; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18628; GFX940-TGSPLIT-NEXT:    s_endpgm
18629;
18630; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18631; GFX11-WGP:       ; %bb.0: ; %entry
18632; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
18633; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18634; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18635; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18636; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18637; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
18638; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
18639; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18640; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
18641; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18642; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18643; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18644; GFX11-WGP-NEXT:    s_endpgm
18645;
18646; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18647; GFX11-CU:       ; %bb.0: ; %entry
18648; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
18649; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18650; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18651; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18652; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18653; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
18654; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
18655; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18656; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
18657; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18658; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18659; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18660; GFX11-CU-NEXT:    s_endpgm
18661;
18662; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18663; GFX12-WGP:       ; %bb.0: ; %entry
18664; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
18665; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18666; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18667; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18668; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18669; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
18670; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
18671; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18672; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
18673; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18674; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18675; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18676; GFX12-WGP-NEXT:    s_endpgm
18677;
18678; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
18679; GFX12-CU:       ; %bb.0: ; %entry
18680; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
18681; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18682; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18683; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18684; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18685; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
18686; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
18687; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18688; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
18689; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18690; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18691; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18692; GFX12-CU-NEXT:    s_endpgm
18693    ptr addrspace(1) %out, i32 %in, i32 %old) {
18694entry:
  ; Address of the i32 element 4 past %out, i.e. byte offset 16.
18695  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
18696  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
  ; Field 0 of the { i32, i1 } cmpxchg result is the value that gets stored to %out.
18697  %val0 = extractvalue { i32, i1 } %val, 0
18698  store i32 %val0, ptr addrspace(1) %out, align 4
18699  ret void
18700}
18701
; Test kernel for a value-returning cmpxchg on global memory (addrspace 1) at
; syncscope "wavefront-one-as", with success ordering acq_rel and failure
; ordering seq_cst.
;
; Arguments are %out (global pointer), %in (new value) and %old (expected
; value). The cmpxchg targets element 4 of the i32 array at %out, which shows
; up as byte offset 16 in the expected output, and the value it loaded is then
; stored to %out itself.
;
; For every configured target the autogenerated assertions below expect a
; straight-line sequence of kernel-argument loads, register moves, one
; returning compare-swap, one wait ahead of the dependent store, the store and
; s_endpgm. Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
18702define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
18703; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18704; GFX6:       ; %bb.0: ; %entry
18705; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
18706; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18707; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
18708; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
18709; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
18710; GFX6-NEXT:    s_mov_b32 s12, s5
18711; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
18712; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
18713; GFX6-NEXT:    s_mov_b32 s11, -1
18714; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
18715; GFX6-NEXT:    s_mov_b32 s5, s12
18716; GFX6-NEXT:    s_mov_b32 s6, s11
18717; GFX6-NEXT:    s_mov_b32 s7, s10
18718; GFX6-NEXT:    v_mov_b32_e32 v0, s9
18719; GFX6-NEXT:    v_mov_b32_e32 v2, s8
18720; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18721; GFX6-NEXT:    v_mov_b32_e32 v1, v2
18722; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
18723; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18724; GFX6-NEXT:    s_waitcnt vmcnt(0)
18725; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
18726; GFX6-NEXT:    s_endpgm
18727;
18728; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18729; GFX7:       ; %bb.0: ; %entry
18730; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18731; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18732; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18733; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18734; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18735; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18736; GFX7-NEXT:    s_mov_b32 s6, s4
18737; GFX7-NEXT:    s_mov_b32 s7, s5
18738; GFX7-NEXT:    s_mov_b32 s11, s12
18739; GFX7-NEXT:    s_mov_b32 s10, s13
18740; GFX7-NEXT:    s_add_u32 s6, s6, s11
18741; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18742; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18743; GFX7-NEXT:    s_mov_b32 s7, s10
18744; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18745; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18746; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18747; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18748; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18749; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18750; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18751; GFX7-NEXT:    v_mov_b32_e32 v0, s4
18752; GFX7-NEXT:    v_mov_b32_e32 v1, s5
18753; GFX7-NEXT:    s_waitcnt vmcnt(0)
18754; GFX7-NEXT:    flat_store_dword v[0:1], v2
18755; GFX7-NEXT:    s_endpgm
18756;
18757; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18758; GFX10-WGP:       ; %bb.0: ; %entry
18759; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
18760; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18761; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
18762; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
18763; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18764; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
18765; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
18766; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18767; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
18768; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18769; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
18770; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
18771; GFX10-WGP-NEXT:    s_endpgm
18772;
18773; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18774; GFX10-CU:       ; %bb.0: ; %entry
18775; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
18776; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18777; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
18778; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
18779; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
18780; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
18781; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
18782; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18783; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
18784; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
18785; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
18786; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
18787; GFX10-CU-NEXT:    s_endpgm
18788;
18789; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18790; SKIP-CACHE-INV:       ; %bb.0: ; %entry
18791; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
18792; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
18793; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
18794; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
18795; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
18796; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
18797; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
18798; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
18799; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
18800; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
18801; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
18802; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
18803; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
18804; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
18805; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
18806; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
18808; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
18809; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18810; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
18811; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
18812; SKIP-CACHE-INV-NEXT:    s_endpgm
18813;
18814; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18815; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
18816; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18817; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18818; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18819; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18820; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18821; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18822; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18823; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18824; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18825; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18826; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18827; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18828; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
18829;
18830; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18831; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
18832; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18833; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
18834; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
18835; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
18836; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18837; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
18838; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
18839; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18840; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18841; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
18842; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18843; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
18844; GFX90A-TGSPLIT-NEXT:    s_endpgm
18845;
18846; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18847; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
18848; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18849; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18850; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18851; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18852; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18853; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18854; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18855; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18856; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18857; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18858; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18859; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18860; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
18861;
18862; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18863; GFX940-TGSPLIT:       ; %bb.0: ; %entry
18864; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
18865; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
18866; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
18867; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
18868; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
18869; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
18870; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
18871; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18872; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
18873; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
18874; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
18875; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
18876; GFX940-TGSPLIT-NEXT:    s_endpgm
18877;
18878; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18879; GFX11-WGP:       ; %bb.0: ; %entry
18880; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
18881; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18882; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18883; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18884; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
18885; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
18886; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
18887; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18888; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
18889; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18890; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
18891; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18892; GFX11-WGP-NEXT:    s_endpgm
18893;
18894; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18895; GFX11-CU:       ; %bb.0: ; %entry
18896; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
18897; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18898; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18899; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18900; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
18901; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
18902; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
18903; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18904; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
18905; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
18906; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
18907; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18908; GFX11-CU-NEXT:    s_endpgm
18909;
18910; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18911; GFX12-WGP:       ; %bb.0: ; %entry
18912; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
18913; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18914; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
18915; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
18916; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
18917; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
18918; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
18919; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18920; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
18921; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18922; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
18923; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
18924; GFX12-WGP-NEXT:    s_endpgm
18925;
18926; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
18927; GFX12-CU:       ; %bb.0: ; %entry
18928; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
18929; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
18930; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
18931; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
18932; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
18933; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
18934; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
18935; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
18936; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
18937; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
18938; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
18939; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
18940; GFX12-CU-NEXT:    s_endpgm
18941    ptr addrspace(1) %out, i32 %in, i32 %old) {
18942entry:
  ; Address of the fifth i32 past %out; this is the location the cmpxchg updates.
18943  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
18944  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
  ; Field 0 of the cmpxchg result is the value read from memory; field 1 (the
  ; success flag) is not used by this test.
18945  %val0 = extractvalue { i32, i1 } %val, 0
18946  store i32 %val0, ptr addrspace(1) %out, align 4
18947  ret void
18948}
18949
; Test kernel for a value-returning cmpxchg on global memory (addrspace 1) at
; syncscope "wavefront-one-as", with seq_cst as both the success and the
; failure ordering.
;
; Arguments are %out (global pointer), %in (new value) and %old (expected
; value). The cmpxchg targets element 4 of the i32 array at %out, which shows
; up as byte offset 16 in the expected output, and the value it loaded is then
; stored to %out itself.
;
; For every configured target the autogenerated assertions below expect a
; straight-line sequence of kernel-argument loads, register moves, one
; returning compare-swap, one wait ahead of the dependent store, the store and
; s_endpgm. Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
18950define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
18951; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
18952; GFX6:       ; %bb.0: ; %entry
18953; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
18954; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18955; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
18956; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
18957; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
18958; GFX6-NEXT:    s_mov_b32 s12, s5
18959; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
18960; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
18961; GFX6-NEXT:    s_mov_b32 s11, -1
18962; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
18963; GFX6-NEXT:    s_mov_b32 s5, s12
18964; GFX6-NEXT:    s_mov_b32 s6, s11
18965; GFX6-NEXT:    s_mov_b32 s7, s10
18966; GFX6-NEXT:    v_mov_b32_e32 v0, s9
18967; GFX6-NEXT:    v_mov_b32_e32 v2, s8
18968; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
18969; GFX6-NEXT:    v_mov_b32_e32 v1, v2
18970; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
18971; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
18972; GFX6-NEXT:    s_waitcnt vmcnt(0)
18973; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
18974; GFX6-NEXT:    s_endpgm
18975;
18976; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
18977; GFX7:       ; %bb.0: ; %entry
18978; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
18979; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
18980; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
18981; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
18982; GFX7-NEXT:    s_mov_b64 s[12:13], 16
18983; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18984; GFX7-NEXT:    s_mov_b32 s6, s4
18985; GFX7-NEXT:    s_mov_b32 s7, s5
18986; GFX7-NEXT:    s_mov_b32 s11, s12
18987; GFX7-NEXT:    s_mov_b32 s10, s13
18988; GFX7-NEXT:    s_add_u32 s6, s6, s11
18989; GFX7-NEXT:    s_addc_u32 s10, s7, s10
18990; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18991; GFX7-NEXT:    s_mov_b32 s7, s10
18992; GFX7-NEXT:    v_mov_b32_e32 v2, s9
18993; GFX7-NEXT:    v_mov_b32_e32 v0, s8
18994; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18995; GFX7-NEXT:    v_mov_b32_e32 v3, v0
18996; GFX7-NEXT:    v_mov_b32_e32 v0, s6
18997; GFX7-NEXT:    v_mov_b32_e32 v1, s7
18998; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18999; GFX7-NEXT:    v_mov_b32_e32 v0, s4
19000; GFX7-NEXT:    v_mov_b32_e32 v1, s5
19001; GFX7-NEXT:    s_waitcnt vmcnt(0)
19002; GFX7-NEXT:    flat_store_dword v[0:1], v2
19003; GFX7-NEXT:    s_endpgm
19004;
19005; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19006; GFX10-WGP:       ; %bb.0: ; %entry
19007; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
19008; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19009; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
19010; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
19011; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19012; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
19013; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
19014; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
19015; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
19016; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
19017; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
19018; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
19019; GFX10-WGP-NEXT:    s_endpgm
19020;
19021; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19022; GFX10-CU:       ; %bb.0: ; %entry
19023; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
19024; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19025; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
19026; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
19027; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
19028; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
19029; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
19030; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
19031; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
19032; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
19033; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
19034; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
19035; GFX10-CU-NEXT:    s_endpgm
19036;
19037; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19038; SKIP-CACHE-INV:       ; %bb.0: ; %entry
19039; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
19040; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
19041; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
19042; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
19043; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
19044; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
19045; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
19046; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
19047; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
19048; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
19049; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
19050; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
19051; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
19052; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
19053; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
19054; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
19055; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
19056; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
19057; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
19058; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
19059; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
19060; SKIP-CACHE-INV-NEXT:    s_endpgm
19061;
19062; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19063; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
19064; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
19065; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19066; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19067; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19068; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19069; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19070; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
19071; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19072; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
19073; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
19074; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19075; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
19076; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
19077;
19078; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19079; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
19080; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
19081; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
19082; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
19083; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
19084; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19085; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
19086; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
19087; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19088; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
19089; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
19090; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19091; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
19092; GFX90A-TGSPLIT-NEXT:    s_endpgm
19093;
19094; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19095; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
19096; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
19097; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19098; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19099; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19100; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19101; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19102; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
19103; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19104; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
19105; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
19106; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19107; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
19108; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
19109;
19110; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19111; GFX940-TGSPLIT:       ; %bb.0: ; %entry
19112; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
19113; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19114; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
19115; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
19116; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
19117; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
19118; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
19119; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19120; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
19121; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
19122; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
19123; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
19124; GFX940-TGSPLIT-NEXT:    s_endpgm
19125;
19126; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19127; GFX11-WGP:       ; %bb.0: ; %entry
19128; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
19129; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19130; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19131; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19132; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
19133; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
19134; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
19135; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
19136; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
19137; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
19138; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
19139; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
19140; GFX11-WGP-NEXT:    s_endpgm
19141;
19142; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19143; GFX11-CU:       ; %bb.0: ; %entry
19144; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
19145; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19146; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19147; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19148; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
19149; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
19150; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
19151; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
19152; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
19153; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
19154; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
19155; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
19156; GFX11-CU-NEXT:    s_endpgm
19157;
19158; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19159; GFX12-WGP:       ; %bb.0: ; %entry
19160; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
19161; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19162; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
19163; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
19164; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
19165; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
19166; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
19167; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
19168; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
19169; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
19170; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
19171; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
19172; GFX12-WGP-NEXT:    s_endpgm
19173;
19174; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
19175; GFX12-CU:       ; %bb.0: ; %entry
19176; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
19177; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
19178; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
19179; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
19180; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
19181; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
19182; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
19183; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
19184; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
19185; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
19186; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
19187; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
19188; GFX12-CU-NEXT:    s_endpgm
19189    ptr addrspace(1) %out, i32 %in, i32 %old) {
19190entry:
  ; Address of the fifth i32 past %out; this is the location the cmpxchg updates.
19191  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
19192  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
  ; Field 0 of the cmpxchg result is the value read from memory; field 1 (the
  ; success flag) is not used by this test.
19193  %val0 = extractvalue { i32, i1 } %val, 0
19194  store i32 %val0, ptr addrspace(1) %out, align 4
19195  ret void
19196}
19197