xref: /llvm-project/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (revision 924a64a3486f9962c42d4ec253774eb2c586ac33)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s
7
; syncscope_system: returning float `atomicrmw fadd` with seq_cst ordering and
; no syncscope on a pointer that carries no address-space qualifier. The
; operation is tagged with !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode.
; Lowering recorded by the checks below, per target:
;   - gfx908 loops on flat_atomic_cmpswap until the compare succeeds.
;   - gfx90a compares the high address dword against src_shared_base and
;     src_private_base and branches to a shared (ds_add_rtn_f32), private
;     (buffer load / add / store) or global (global_atomic_add_f32) path.
;   - gfx940, gfx1100 and gfx1200 emit a single returning flat_atomic_add_f32.
8define float @syncscope_system(ptr %addr, float %val) #0 {
9; GFX908-LABEL: syncscope_system:
10; GFX908:       ; %bb.0:
11; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX908-NEXT:    flat_load_dword v3, v[0:1]
13; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
15; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17; GFX908-NEXT:    v_mov_b32_e32 v4, v3
18; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
19; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
20; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
21; GFX908-NEXT:    buffer_wbinvl1_vol
22; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
23; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
24; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
25; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
26; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
27; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
28; GFX908-NEXT:    v_mov_b32_e32 v0, v3
29; GFX908-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX90A-LABEL: syncscope_system:
32; GFX90A:       ; %bb.0:
33; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
35; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
36; GFX90A-NEXT:    ; implicit-def: $vgpr3
37; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
38; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
39; GFX90A-NEXT:    s_cbranch_execz .LBB0_6
40; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
41; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
42; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
43; GFX90A-NEXT:    ; implicit-def: $vgpr3
44; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
45; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
46; GFX90A-NEXT:    s_cbranch_execz .LBB0_3
47; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
48; GFX90A-NEXT:    buffer_wbl2
49; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
50; GFX90A-NEXT:    s_waitcnt vmcnt(0)
51; GFX90A-NEXT:    buffer_invl2
52; GFX90A-NEXT:    buffer_wbinvl1_vol
53; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
54; GFX90A-NEXT:    ; implicit-def: $vgpr2
55; GFX90A-NEXT:  .LBB0_3: ; %Flow
56; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
57; GFX90A-NEXT:    s_cbranch_execz .LBB0_5
58; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
59; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
60; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
61; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
62; GFX90A-NEXT:    s_waitcnt vmcnt(0)
63; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
64; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
65; GFX90A-NEXT:  .LBB0_5: ; %Flow1
66; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
67; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
68; GFX90A-NEXT:    ; implicit-def: $vgpr2
69; GFX90A-NEXT:  .LBB0_6: ; %Flow2
70; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
71; GFX90A-NEXT:    s_cbranch_execz .LBB0_8
72; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
73; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
74; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
75; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
76; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX90A-NEXT:  .LBB0_8: ; %atomicrmw.phi
78; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
79; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
80; GFX90A-NEXT:    s_waitcnt vmcnt(0)
81; GFX90A-NEXT:    s_setpc_b64 s[30:31]
82;
83; GFX940-LABEL: syncscope_system:
84; GFX940:       ; %bb.0:
85; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86; GFX940-NEXT:    buffer_wbl2 sc0 sc1
87; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
88; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
89; GFX940-NEXT:    buffer_inv sc0 sc1
90; GFX940-NEXT:    s_setpc_b64 s[30:31]
91;
92; GFX1100-LABEL: syncscope_system:
93; GFX1100:       ; %bb.0:
94; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
96; GFX1100-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
97; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
98; GFX1100-NEXT:    buffer_gl1_inv
99; GFX1100-NEXT:    buffer_gl0_inv
100; GFX1100-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX1200-LABEL: syncscope_system:
103; GFX1200:       ; %bb.0:
104; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
105; GFX1200-NEXT:    s_wait_expcnt 0x0
106; GFX1200-NEXT:    s_wait_samplecnt 0x0
107; GFX1200-NEXT:    s_wait_bvhcnt 0x0
108; GFX1200-NEXT:    s_wait_kmcnt 0x0
109; GFX1200-NEXT:    global_wb scope:SCOPE_SYS
110; GFX1200-NEXT:    s_wait_storecnt 0x0
111; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
112; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
113; GFX1200-NEXT:    global_inv scope:SCOPE_SYS
114; GFX1200-NEXT:    s_setpc_b64 s[30:31]
; Operation under test: no syncscope(...) clause is given; the old value is returned.
115  %res = atomicrmw fadd ptr %addr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
116  ret float %res
117}
118
; syncscope_workgroup_rtn: returning float `atomicrmw fadd` with
; syncscope("workgroup") and seq_cst ordering on a pointer that carries no
; address-space qualifier, tagged with !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode. The old value is the function result.
; Lowering recorded by the checks below, per target:
;   - gfx908 loops on flat_atomic_cmpswap until the compare succeeds.
;   - gfx90a compares the high address dword against src_shared_base and
;     src_private_base and branches to a shared (ds_add_rtn_f32), private
;     (buffer load / add / store) or global (global_atomic_add_f32 ... glc) path.
;   - gfx940, gfx1100 and gfx1200 emit a single returning flat_atomic_add_f32.
119define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
120; GFX908-LABEL: syncscope_workgroup_rtn:
121; GFX908:       ; %bb.0:
122; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX908-NEXT:    flat_load_dword v3, v[0:1]
124; GFX908-NEXT:    s_mov_b64 s[4:5], 0
125; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
126; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
127; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
128; GFX908-NEXT:    v_mov_b32_e32 v4, v3
129; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
130; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
131; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
132; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
133; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
134; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
135; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
136; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
137; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
138; GFX908-NEXT:    v_mov_b32_e32 v0, v3
139; GFX908-NEXT:    s_setpc_b64 s[30:31]
140;
141; GFX90A-LABEL: syncscope_workgroup_rtn:
142; GFX90A:       ; %bb.0:
143; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
145; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
146; GFX90A-NEXT:    ; implicit-def: $vgpr3
147; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
148; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
149; GFX90A-NEXT:    s_cbranch_execz .LBB1_6
150; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
151; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
152; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
153; GFX90A-NEXT:    ; implicit-def: $vgpr3
154; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
155; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
156; GFX90A-NEXT:    s_cbranch_execz .LBB1_3
157; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
158; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
159; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
160; GFX90A-NEXT:    ; implicit-def: $vgpr2
161; GFX90A-NEXT:  .LBB1_3: ; %Flow
162; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
163; GFX90A-NEXT:    s_cbranch_execz .LBB1_5
164; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
165; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
166; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
167; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
168; GFX90A-NEXT:    s_waitcnt vmcnt(0)
169; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
170; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
171; GFX90A-NEXT:  .LBB1_5: ; %Flow1
172; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
173; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
174; GFX90A-NEXT:    ; implicit-def: $vgpr2
175; GFX90A-NEXT:  .LBB1_6: ; %Flow2
176; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
177; GFX90A-NEXT:    s_cbranch_execz .LBB1_8
178; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
179; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
180; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
181; GFX90A-NEXT:    s_waitcnt vmcnt(0)
182; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
183; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX90A-NEXT:  .LBB1_8: ; %atomicrmw.phi
185; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
186; GFX90A-NEXT:    s_waitcnt vmcnt(0)
187; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
188; GFX90A-NEXT:    s_setpc_b64 s[30:31]
189;
190; GFX940-LABEL: syncscope_workgroup_rtn:
191; GFX940:       ; %bb.0:
192; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
194; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
195; GFX940-NEXT:    s_setpc_b64 s[30:31]
196;
197; GFX1100-LABEL: syncscope_workgroup_rtn:
198; GFX1100:       ; %bb.0:
199; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
201; GFX1100-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
202; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
203; GFX1100-NEXT:    buffer_gl0_inv
204; GFX1100-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX1200-LABEL: syncscope_workgroup_rtn:
207; GFX1200:       ; %bb.0:
208; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
209; GFX1200-NEXT:    s_wait_expcnt 0x0
210; GFX1200-NEXT:    s_wait_samplecnt 0x0
211; GFX1200-NEXT:    s_wait_bvhcnt 0x0
212; GFX1200-NEXT:    s_wait_kmcnt 0x0
213; GFX1200-NEXT:    s_wait_storecnt 0x0
214; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
215; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
216; GFX1200-NEXT:    global_inv scope:SCOPE_SE
217; GFX1200-NEXT:    s_setpc_b64 s[30:31]
; Operation under test: workgroup-scope fadd whose old value is returned to the caller.
218  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
219  ret float %res
220}
221
; syncscope_workgroup_nortn: the same workgroup-scope seq_cst `atomicrmw fadd`
; (with !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode) as the
; returning variant, except that the function returns void and %res is unused.
; Lowering recorded by the checks below, per target:
;   - gfx908 and gfx90a compare the high address dword against src_shared_base
;     and src_private_base and branch to a shared (ds_add_f32), private
;     (buffer load / add / store) or global (global_atomic_add_f32 without glc)
;     path.
;   - gfx940, gfx1100 and gfx1200 emit a single flat_atomic_add_f32 with no
;     destination register.
222define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
223; GFX908-LABEL: syncscope_workgroup_nortn:
224; GFX908:       ; %bb.0:
225; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
227; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
228; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
229; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
230; GFX908-NEXT:    s_cbranch_execnz .LBB2_3
231; GFX908-NEXT:  ; %bb.1: ; %Flow2
232; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
233; GFX908-NEXT:    s_cbranch_execnz .LBB2_8
234; GFX908-NEXT:  .LBB2_2: ; %atomicrmw.phi
235; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
236; GFX908-NEXT:    s_waitcnt vmcnt(0)
237; GFX908-NEXT:    s_setpc_b64 s[30:31]
238; GFX908-NEXT:  .LBB2_3: ; %atomicrmw.check.private
239; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
240; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
241; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
242; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
243; GFX908-NEXT:    s_cbranch_execz .LBB2_5
244; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
245; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
246; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
247; GFX908-NEXT:    ; implicit-def: $vgpr2
248; GFX908-NEXT:  .LBB2_5: ; %Flow
249; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
250; GFX908-NEXT:    s_cbranch_execz .LBB2_7
251; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
252; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
253; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
254; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
255; GFX908-NEXT:    s_waitcnt vmcnt(0)
256; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
257; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
258; GFX908-NEXT:  .LBB2_7: ; %Flow1
259; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
260; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
261; GFX908-NEXT:    ; implicit-def: $vgpr2
262; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
263; GFX908-NEXT:    s_cbranch_execz .LBB2_2
264; GFX908-NEXT:  .LBB2_8: ; %atomicrmw.shared
265; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
266; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
267; GFX908-NEXT:    ds_add_f32 v0, v2
268; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
270; GFX908-NEXT:    s_waitcnt vmcnt(0)
271; GFX908-NEXT:    s_setpc_b64 s[30:31]
272;
273; GFX90A-LABEL: syncscope_workgroup_nortn:
274; GFX90A:       ; %bb.0:
275; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
277; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
278; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
279; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
280; GFX90A-NEXT:    s_cbranch_execnz .LBB2_3
281; GFX90A-NEXT:  ; %bb.1: ; %Flow2
282; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
283; GFX90A-NEXT:    s_cbranch_execnz .LBB2_8
284; GFX90A-NEXT:  .LBB2_2: ; %atomicrmw.phi
285; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
286; GFX90A-NEXT:    s_waitcnt vmcnt(0)
287; GFX90A-NEXT:    s_setpc_b64 s[30:31]
288; GFX90A-NEXT:  .LBB2_3: ; %atomicrmw.check.private
289; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
290; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
291; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
292; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
293; GFX90A-NEXT:    s_cbranch_execz .LBB2_5
294; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
295; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
296; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
297; GFX90A-NEXT:    ; implicit-def: $vgpr2
298; GFX90A-NEXT:  .LBB2_5: ; %Flow
299; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
300; GFX90A-NEXT:    s_cbranch_execz .LBB2_7
301; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
302; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
303; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
304; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
305; GFX90A-NEXT:    s_waitcnt vmcnt(0)
306; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
307; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
308; GFX90A-NEXT:  .LBB2_7: ; %Flow1
309; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
310; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
311; GFX90A-NEXT:    ; implicit-def: $vgpr2
312; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
313; GFX90A-NEXT:    s_cbranch_execz .LBB2_2
314; GFX90A-NEXT:  .LBB2_8: ; %atomicrmw.shared
315; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
316; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
317; GFX90A-NEXT:    ds_add_f32 v0, v2
318; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
320; GFX90A-NEXT:    s_waitcnt vmcnt(0)
321; GFX90A-NEXT:    s_setpc_b64 s[30:31]
322;
323; GFX940-LABEL: syncscope_workgroup_nortn:
324; GFX940:       ; %bb.0:
325; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
327; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
328; GFX940-NEXT:    s_setpc_b64 s[30:31]
329;
330; GFX1100-LABEL: syncscope_workgroup_nortn:
331; GFX1100:       ; %bb.0:
332; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
333; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
334; GFX1100-NEXT:    flat_atomic_add_f32 v[0:1], v2
335; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
337; GFX1100-NEXT:    buffer_gl0_inv
338; GFX1100-NEXT:    s_setpc_b64 s[30:31]
339;
340; GFX1200-LABEL: syncscope_workgroup_nortn:
341; GFX1200:       ; %bb.0:
342; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
343; GFX1200-NEXT:    s_wait_expcnt 0x0
344; GFX1200-NEXT:    s_wait_samplecnt 0x0
345; GFX1200-NEXT:    s_wait_bvhcnt 0x0
346; GFX1200-NEXT:    s_wait_kmcnt 0x0
347; GFX1200-NEXT:    s_wait_storecnt 0x0
348; GFX1200-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
349; GFX1200-NEXT:    s_wait_storecnt_dscnt 0x0
350; GFX1200-NEXT:    global_inv scope:SCOPE_SE
351; GFX1200-NEXT:    s_setpc_b64 s[30:31]
; Operation under test: %res is never used, so the no-return instruction forms are selected above.
352  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
353  ret void
354}
355
; no_unsafe: returning workgroup-scope seq_cst `atomicrmw fadd` with no
; metadata attached to the instruction. Unlike the other functions in this
; file, the definition also carries no #0 attribute group.
; Lowering recorded by the checks below, per target:
;   - gfx908, gfx90a and gfx1100 loop on a flat compare-and-swap
;     (flat_atomic_cmpswap / flat_atomic_cmpswap_b32) until the compare succeeds.
;   - gfx940 and gfx1200 still emit a single returning flat_atomic_add_f32.
356define float @no_unsafe(ptr %addr, float %val) {
357; GFX908-LABEL: no_unsafe:
358; GFX908:       ; %bb.0:
359; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360; GFX908-NEXT:    flat_load_dword v3, v[0:1]
361; GFX908-NEXT:    s_mov_b64 s[4:5], 0
362; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
363; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
364; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
365; GFX908-NEXT:    v_mov_b32_e32 v4, v3
366; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
367; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
368; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
369; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
370; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
371; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
372; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
373; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
374; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
375; GFX908-NEXT:    v_mov_b32_e32 v0, v3
376; GFX908-NEXT:    s_setpc_b64 s[30:31]
377;
378; GFX90A-LABEL: no_unsafe:
379; GFX90A:       ; %bb.0:
380; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
382; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
383; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
384; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
385; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
386; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
387; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
388; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
389; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
390; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
391; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
392; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
393; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
394; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
395; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
396; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
397; GFX90A-NEXT:    s_setpc_b64 s[30:31]
398;
399; GFX940-LABEL: no_unsafe:
400; GFX940:       ; %bb.0:
401; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
403; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
404; GFX940-NEXT:    s_setpc_b64 s[30:31]
405;
406; GFX1100-LABEL: no_unsafe:
407; GFX1100:       ; %bb.0:
408; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409; GFX1100-NEXT:    flat_load_b32 v3, v[0:1]
410; GFX1100-NEXT:    s_mov_b32 s0, 0
411; GFX1100-NEXT:  .LBB3_1: ; %atomicrmw.start
412; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
413; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
414; GFX1100-NEXT:    v_mov_b32_e32 v4, v3
415; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
416; GFX1100-NEXT:    v_add_f32_e32 v3, v4, v2
417; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
418; GFX1100-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
419; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
420; GFX1100-NEXT:    buffer_gl0_inv
421; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
422; GFX1100-NEXT:    s_or_b32 s0, vcc_lo, s0
423; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
424; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
425; GFX1100-NEXT:    s_cbranch_execnz .LBB3_1
426; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.end
427; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
428; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
429; GFX1100-NEXT:    s_setpc_b64 s[30:31]
430;
431; GFX1200-LABEL: no_unsafe:
432; GFX1200:       ; %bb.0:
433; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
434; GFX1200-NEXT:    s_wait_expcnt 0x0
435; GFX1200-NEXT:    s_wait_samplecnt 0x0
436; GFX1200-NEXT:    s_wait_bvhcnt 0x0
437; GFX1200-NEXT:    s_wait_kmcnt 0x0
438; GFX1200-NEXT:    s_wait_storecnt 0x0
439; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
440; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
441; GFX1200-NEXT:    global_inv scope:SCOPE_SE
442; GFX1200-NEXT:    s_setpc_b64 s[30:31]
; Operation under test: no !amdgpu.* annotations are attached to this atomicrmw.
443  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
444  ret float %res
445}
446
; Attribute group #0 is referenced by the three syncscope_* functions; no_unsafe
; is declared without it.
447attributes #0 = { nounwind }
448
; Empty metadata node used as the operand of the !amdgpu.no.fine.grained.memory
; and !amdgpu.ignore.denormal.mode annotations on the atomicrmw instructions.
449!0 = !{}
450