xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9
3; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10
5; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12
8
9; from atomicrmw-expand.ll
10; covers flat_load, flat_atomic (atomic with return)
11;
; Performs a seq_cst `atomicrmw fadd` of %val at workgroup sync scope on %addr,
; a pointer with no address-space qualifier, and returns void (%res has no
; uses).
; Per the assertions below, every run line except the gfx1200 one expects a
; single flat load of the old value followed by an %atomicrmw.start retry loop
; built around flat_atomic_cmpswap with glc; the gfx1200 run expects one
; flat_atomic_add_f32 and no loop.
; In every variant each flat memory instruction is directly followed by a wait
; on its counters.
12define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
13; GFX9-LABEL: syncscope_workgroup_nortn:
14; GFX9:       ; %bb.0:
15; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; GFX9-NEXT:    flat_load_dword v4, v[0:1]
17; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18; GFX9-NEXT:    s_mov_b64 s[4:5], 0
19; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
20; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
21; GFX9-NEXT:    v_add_f32_e32 v3, v4, v2
22; GFX9-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
23; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
24; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
25; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
26; GFX9-NEXT:    v_mov_b32_e32 v4, v3
27; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
28; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
29; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
30; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
31; GFX9-NEXT:    s_setpc_b64 s[30:31]
32;
33; GFX90A-LABEL: syncscope_workgroup_nortn:
34; GFX90A:       ; %bb.0:
35; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
37; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
38; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
39; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
40; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
41; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
42; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
43; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
44; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
45; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
46; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
47; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
48; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
49; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
50; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
51; GFX90A-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX10-LABEL: syncscope_workgroup_nortn:
54; GFX10:       ; %bb.0:
55; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX10-NEXT:    flat_load_dword v4, v[0:1]
57; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
58; GFX10-NEXT:    s_mov_b32 s4, 0
59; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
60; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
61; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
62; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
63; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
64; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
65; GFX10-NEXT:    buffer_gl0_inv
66; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
67; GFX10-NEXT:    v_mov_b32_e32 v4, v3
68; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
69; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
70; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
71; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
72; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
73; GFX10-NEXT:    s_setpc_b64 s[30:31]
74;
75; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn:
76; GFX9-FLATSCR:       ; %bb.0:
77; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78; GFX9-FLATSCR-NEXT:    flat_load_dword v4, v[0:1]
79; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
80; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], 0
81; GFX9-FLATSCR-NEXT:  .LBB0_1: ; %atomicrmw.start
82; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
83; GFX9-FLATSCR-NEXT:    v_add_f32_e32 v3, v4, v2
84; GFX9-FLATSCR-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
85; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
86; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
87; GFX9-FLATSCR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
88; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v4, v3
89; GFX9-FLATSCR-NEXT:    s_andn2_b64 exec, exec, s[0:1]
90; GFX9-FLATSCR-NEXT:    s_cbranch_execnz .LBB0_1
91; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %atomicrmw.end
92; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
93; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
94;
95; GFX11-LABEL: syncscope_workgroup_nortn:
96; GFX11:       ; %bb.0:
97; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
99; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
100; GFX11-NEXT:    s_mov_b32 s0, 0
101; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
102; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
103; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
104; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
105; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
106; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
107; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
108; GFX11-NEXT:    buffer_gl0_inv
109; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
110; GFX11-NEXT:    v_mov_b32_e32 v4, v3
111; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
112; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
113; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
114; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
115; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
116; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
117; GFX11-NEXT:    s_setpc_b64 s[30:31]
118;
119; GFX12-LABEL: syncscope_workgroup_nortn:
120; GFX12:       ; %bb.0:
121; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
122; GFX12-NEXT:    s_wait_expcnt 0x0
123; GFX12-NEXT:    s_wait_samplecnt 0x0
124; GFX12-NEXT:    s_wait_bvhcnt 0x0
125; GFX12-NEXT:    s_wait_kmcnt 0x0
126; GFX12-NEXT:    s_wait_storecnt 0x0
127; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
128; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
129; GFX12-NEXT:    global_inv scope:SCOPE_SE
130; GFX12-NEXT:    s_setpc_b64 s[30:31]
; The old value produced by the atomic (%res) is never read before the return.
131  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
132  ret void
133}
134
135; from atomicrmw-nand.ll
136; covers global_atomic (atomic with return), global_load
137;
; Applies a seq_cst `atomicrmw nand` with the constant 4 to the i32 at the
; addrspace(1) pointer %ptr and returns the previous value (%result). No
; syncscope is given; the gfx1200 assertions show SCOPE_SYS on the atomic.
; All six run lines expect the same shape: one global load of the old value,
; then an %atomicrmw.start retry loop around global_atomic_cmpswap, with a wait
; directly after the load and after the cmpswap.
; Inside the loop the new value is formed as v_not followed by v_or with -5,
; i.e. ~old | ~4, which equals ~(old & 4).
138define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
139; GFX9-LABEL: atomic_nand_i32_global:
140; GFX9:       ; %bb.0:
141; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142; GFX9-NEXT:    global_load_dword v2, v[0:1], off
143; GFX9-NEXT:    s_waitcnt vmcnt(0)
144; GFX9-NEXT:    s_mov_b64 s[4:5], 0
145; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
146; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
147; GFX9-NEXT:    v_mov_b32_e32 v3, v2
148; GFX9-NEXT:    v_not_b32_e32 v2, v3
149; GFX9-NEXT:    v_or_b32_e32 v2, -5, v2
150; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
151; GFX9-NEXT:    s_waitcnt vmcnt(0)
152; GFX9-NEXT:    buffer_wbinvl1_vol
153; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
154; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
155; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
156; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
157; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
158; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
159; GFX9-NEXT:    v_mov_b32_e32 v0, v2
160; GFX9-NEXT:    s_setpc_b64 s[30:31]
161;
162; GFX90A-LABEL: atomic_nand_i32_global:
163; GFX90A:       ; %bb.0:
164; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX90A-NEXT:    global_load_dword v2, v[0:1], off
166; GFX90A-NEXT:    s_waitcnt vmcnt(0)
167; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
168; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
169; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
170; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
171; GFX90A-NEXT:    v_not_b32_e32 v2, v3
172; GFX90A-NEXT:    v_or_b32_e32 v2, -5, v2
173; GFX90A-NEXT:    buffer_wbl2
174; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
175; GFX90A-NEXT:    s_waitcnt vmcnt(0)
176; GFX90A-NEXT:    buffer_invl2
177; GFX90A-NEXT:    buffer_wbinvl1_vol
178; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
179; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
180; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
181; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
182; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
183; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
184; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
185; GFX90A-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX10-LABEL: atomic_nand_i32_global:
188; GFX10:       ; %bb.0:
189; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX10-NEXT:    global_load_dword v2, v[0:1], off
191; GFX10-NEXT:    s_waitcnt vmcnt(0)
192; GFX10-NEXT:    s_mov_b32 s4, 0
193; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
194; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
195; GFX10-NEXT:    v_mov_b32_e32 v3, v2
196; GFX10-NEXT:    v_not_b32_e32 v2, v3
197; GFX10-NEXT:    v_or_b32_e32 v2, -5, v2
198; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
199; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
200; GFX10-NEXT:    s_waitcnt vmcnt(0)
201; GFX10-NEXT:    buffer_gl1_inv
202; GFX10-NEXT:    buffer_gl0_inv
203; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
204; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
205; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
206; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
207; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
208; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
209; GFX10-NEXT:    v_mov_b32_e32 v0, v2
210; GFX10-NEXT:    s_setpc_b64 s[30:31]
211;
212; GFX9-FLATSCR-LABEL: atomic_nand_i32_global:
213; GFX9-FLATSCR:       ; %bb.0:
214; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215; GFX9-FLATSCR-NEXT:    global_load_dword v2, v[0:1], off
216; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
217; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], 0
218; GFX9-FLATSCR-NEXT:  .LBB1_1: ; %atomicrmw.start
219; GFX9-FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
220; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v3, v2
221; GFX9-FLATSCR-NEXT:    v_not_b32_e32 v2, v3
222; GFX9-FLATSCR-NEXT:    v_or_b32_e32 v2, -5, v2
223; GFX9-FLATSCR-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
224; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
225; GFX9-FLATSCR-NEXT:    buffer_wbinvl1_vol
226; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
227; GFX9-FLATSCR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
228; GFX9-FLATSCR-NEXT:    s_andn2_b64 exec, exec, s[0:1]
229; GFX9-FLATSCR-NEXT:    s_cbranch_execnz .LBB1_1
230; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %atomicrmw.end
231; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
232; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
233; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
234;
235; GFX11-LABEL: atomic_nand_i32_global:
236; GFX11:       ; %bb.0:
237; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238; GFX11-NEXT:    global_load_b32 v2, v[0:1], off
239; GFX11-NEXT:    s_waitcnt vmcnt(0)
240; GFX11-NEXT:    s_mov_b32 s0, 0
241; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
242; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
243; GFX11-NEXT:    v_mov_b32_e32 v3, v2
244; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
245; GFX11-NEXT:    v_not_b32_e32 v2, v3
246; GFX11-NEXT:    v_or_b32_e32 v2, -5, v2
247; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
248; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
249; GFX11-NEXT:    s_waitcnt vmcnt(0)
250; GFX11-NEXT:    buffer_gl1_inv
251; GFX11-NEXT:    buffer_gl0_inv
252; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
253; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
254; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
255; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
256; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
257; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
258; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
259; GFX11-NEXT:    v_mov_b32_e32 v0, v2
260; GFX11-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX12-LABEL: atomic_nand_i32_global:
263; GFX12:       ; %bb.0:
264; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
265; GFX12-NEXT:    s_wait_expcnt 0x0
266; GFX12-NEXT:    s_wait_samplecnt 0x0
267; GFX12-NEXT:    s_wait_bvhcnt 0x0
268; GFX12-NEXT:    s_wait_kmcnt 0x0
269; GFX12-NEXT:    global_load_b32 v2, v[0:1], off
270; GFX12-NEXT:    s_wait_loadcnt 0x0
271; GFX12-NEXT:    s_mov_b32 s0, 0
272; GFX12-NEXT:  .LBB1_1: ; %atomicrmw.start
273; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
274; GFX12-NEXT:    v_mov_b32_e32 v3, v2
275; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
276; GFX12-NEXT:    v_not_b32_e32 v2, v3
277; GFX12-NEXT:    v_or_b32_e32 v2, -5, v2
278; GFX12-NEXT:    global_wb scope:SCOPE_SYS
279; GFX12-NEXT:    s_wait_storecnt 0x0
280; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
281; GFX12-NEXT:    s_wait_loadcnt 0x0
282; GFX12-NEXT:    global_inv scope:SCOPE_SYS
283; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
284; GFX12-NEXT:    s_wait_alu 0xfffe
285; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
286; GFX12-NEXT:    s_wait_alu 0xfffe
287; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
288; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
289; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
290; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
291; GFX12-NEXT:    v_mov_b32_e32 v0, v2
292; GFX12-NEXT:    s_wait_alu 0xfffe
293; GFX12-NEXT:    s_setpc_b64 s[30:31]
; Unlike a no-return atomic, the old value is live here: it is the return value.
294  %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
295  ret i32 %result
296}
297
298; from call-argument-types.ll
299; covers scratch_load, scratch_store, buffer_load, buffer_store
300;
; External callee used by the test below: takes a <32 x i32> vector plus an
; addrspace(5) pointer passed byval(double) with 16-byte alignment.
301declare hidden void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16)
; Tail-calls @byval_align16_f64_arg, forwarding %val and a fresh addrspace(5)
; alloca as the byval argument; %tmp is not referenced by the body.
; Per the assertions below, the gfx900, gfx90a and gfx1010 runs expect
; buffer_load/buffer_store through s[0:3], while the flat-scratch gfx900 run
; and the gfx1100/gfx1200 runs expect scratch_load/scratch_store.
; In every variant each of those loads and stores is directly followed by a
; wait, and the call itself appears as s_setpc_b64 to an address built with
; s_getpc_b64 plus the callee's rel32 offsets.
302define void @tail_call_byval_align16(<32 x i32> %val, double %tmp)  {
303; GFX9-LABEL: tail_call_byval_align16:
304; GFX9:       ; %bb.0: ; %entry
305; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
307; GFX9-NEXT:    s_waitcnt vmcnt(0)
308; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
309; GFX9-NEXT:    s_waitcnt vmcnt(0)
310; GFX9-NEXT:    s_getpc_b64 s[16:17]
311; GFX9-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
312; GFX9-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
313; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:20
314; GFX9-NEXT:    s_waitcnt vmcnt(0)
315; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
316; GFX9-NEXT:    s_waitcnt vmcnt(0)
317; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:16
318; GFX9-NEXT:    s_waitcnt vmcnt(0)
319; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
320; GFX9-NEXT:    s_waitcnt vmcnt(0)
321; GFX9-NEXT:    s_setpc_b64 s[16:17]
322;
323; GFX90A-LABEL: tail_call_byval_align16:
324; GFX90A:       ; %bb.0: ; %entry
325; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326; GFX90A-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
327; GFX90A-NEXT:    s_waitcnt vmcnt(0)
328; GFX90A-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
329; GFX90A-NEXT:    s_waitcnt vmcnt(0)
330; GFX90A-NEXT:    buffer_load_dword v34, off, s[0:3], s32
331; GFX90A-NEXT:    s_waitcnt vmcnt(0)
332; GFX90A-NEXT:    s_getpc_b64 s[16:17]
333; GFX90A-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
334; GFX90A-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
335; GFX90A-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:20
336; GFX90A-NEXT:    s_waitcnt vmcnt(0)
337; GFX90A-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:16
338; GFX90A-NEXT:    s_waitcnt vmcnt(0)
339; GFX90A-NEXT:    buffer_store_dword v34, off, s[0:3], s32
340; GFX90A-NEXT:    s_waitcnt vmcnt(0)
341; GFX90A-NEXT:    s_setpc_b64 s[16:17]
342;
343; GFX10-LABEL: tail_call_byval_align16:
344; GFX10:       ; %bb.0: ; %entry
345; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
347; GFX10-NEXT:    s_waitcnt vmcnt(0)
348; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
349; GFX10-NEXT:    s_waitcnt vmcnt(0)
350; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32
351; GFX10-NEXT:    s_waitcnt vmcnt(0)
352; GFX10-NEXT:    s_getpc_b64 s[16:17]
353; GFX10-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
354; GFX10-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
355; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:20
356; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
357; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:16
358; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
359; GFX10-NEXT:    buffer_store_dword v34, off, s[0:3], s32
360; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
361; GFX10-NEXT:    s_setpc_b64 s[16:17]
362;
363; GFX9-FLATSCR-LABEL: tail_call_byval_align16:
364; GFX9-FLATSCR:       ; %bb.0: ; %entry
365; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366; GFX9-FLATSCR-NEXT:    scratch_load_dword v32, off, s32
367; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
368; GFX9-FLATSCR-NEXT:    s_getpc_b64 s[0:1]
369; GFX9-FLATSCR-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
370; GFX9-FLATSCR-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
371; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v32, s32
372; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
373; GFX9-FLATSCR-NEXT:    scratch_load_dwordx2 v[32:33], off, s32 offset:24
374; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
375; GFX9-FLATSCR-NEXT:    scratch_store_dwordx2 off, v[32:33], s32 offset:16
376; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
377; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[0:1]
378;
379; GFX11-LABEL: tail_call_byval_align16:
380; GFX11:       ; %bb.0: ; %entry
381; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382; GFX11-NEXT:    scratch_load_b32 v32, off, s32
383; GFX11-NEXT:    s_waitcnt vmcnt(0)
384; GFX11-NEXT:    s_getpc_b64 s[0:1]
385; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
386; GFX11-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
387; GFX11-NEXT:    scratch_store_b32 off, v32, s32
388; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
389; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s32 offset:24
390; GFX11-NEXT:    s_waitcnt vmcnt(0)
391; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32 offset:16
392; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
393; GFX11-NEXT:    s_setpc_b64 s[0:1]
394;
395; GFX12-LABEL: tail_call_byval_align16:
396; GFX12:       ; %bb.0: ; %entry
397; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
398; GFX12-NEXT:    s_wait_expcnt 0x0
399; GFX12-NEXT:    s_wait_samplecnt 0x0
400; GFX12-NEXT:    s_wait_bvhcnt 0x0
401; GFX12-NEXT:    s_wait_kmcnt 0x0
402; GFX12-NEXT:    scratch_load_b32 v32, off, s32
403; GFX12-NEXT:    s_wait_loadcnt 0x0
404; GFX12-NEXT:    s_getpc_b64 s[0:1]
405; GFX12-NEXT:    s_wait_alu 0xfffe
406; GFX12-NEXT:    s_sext_i32_i16 s1, s1
407; GFX12-NEXT:    s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12
408; GFX12-NEXT:    s_wait_alu 0xfffe
409; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24
410; GFX12-NEXT:    scratch_store_b32 off, v32, s32
411; GFX12-NEXT:    s_wait_storecnt 0x0
412; GFX12-NEXT:    scratch_load_b64 v[32:33], off, s32 offset:24
413; GFX12-NEXT:    s_wait_loadcnt 0x0
414; GFX12-NEXT:    scratch_store_b64 off, v[32:33], s32 offset:16
415; GFX12-NEXT:    s_wait_storecnt 0x0
416; GFX12-NEXT:    s_wait_alu 0xfffe
417; GFX12-NEXT:    s_setpc_b64 s[0:1]
418entry:
; The alloca is declared with align 8 while the call site requests byval
; align 16 for the same pointer.
419  %alloca = alloca double, align 8, addrspace(5)
420  tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
421  ret void
422}
423
424; from udiv.ll
425; covers s_load
426;
; Kernel that stores the unsigned quotient %x / %y to the addrspace(1) pointer
; %out.
; Per the assertions below, every variant starts with one four-dword scalar
; load from s[4:5] at offset 0x24 (presumably fetching the kernel arguments --
; confirm against the kernarg layout), directly followed by a wait on the
; scalar-memory counter.
; The division is expanded inline: a v_rcp_iflag_f32 reciprocal estimate scaled
; by 0x4f7ffffe, followed by two compare-and-select correction steps.
; The final global store is directly followed by a wait and then s_endpgm.
427define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
428; GFX9-LABEL: udiv_i32:
429; GFX9:       ; %bb.0:
430; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
431; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX9-NEXT:    v_mov_b32_e32 v1, 0
433; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
434; GFX9-NEXT:    s_sub_i32 s4, 0, s3
435; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
436; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
437; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
438; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
439; GFX9-NEXT:    s_mul_i32 s4, s4, s5
440; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
441; GFX9-NEXT:    s_add_i32 s5, s5, s4
442; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
443; GFX9-NEXT:    s_mul_i32 s5, s4, s3
444; GFX9-NEXT:    s_sub_i32 s2, s2, s5
445; GFX9-NEXT:    s_add_i32 s6, s4, 1
446; GFX9-NEXT:    s_sub_i32 s5, s2, s3
447; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
448; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
449; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
450; GFX9-NEXT:    s_add_i32 s5, s4, 1
451; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
452; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
453; GFX9-NEXT:    v_mov_b32_e32 v0, s2
454; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
455; GFX9-NEXT:    s_waitcnt vmcnt(0)
456; GFX9-NEXT:    s_endpgm
457;
458; GFX90A-LABEL: udiv_i32:
459; GFX90A:       ; %bb.0:
460; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
461; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
463; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
464; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
465; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
466; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
467; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
468; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
469; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
470; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
471; GFX90A-NEXT:    s_add_i32 s5, s5, s4
472; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
473; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
474; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
475; GFX90A-NEXT:    s_add_i32 s6, s4, 1
476; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
477; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
478; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
479; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
480; GFX90A-NEXT:    s_add_i32 s5, s4, 1
481; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
482; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
483; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
484; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
485; GFX90A-NEXT:    s_waitcnt vmcnt(0)
486; GFX90A-NEXT:    s_endpgm
487;
488; GFX10-LABEL: udiv_i32:
489; GFX10:       ; %bb.0:
490; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
491; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
493; GFX10-NEXT:    s_sub_i32 s5, 0, s3
494; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
495; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
496; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
497; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
498; GFX10-NEXT:    v_mov_b32_e32 v0, 0
499; GFX10-NEXT:    s_mul_i32 s5, s5, s4
500; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
501; GFX10-NEXT:    s_add_i32 s4, s4, s5
502; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
503; GFX10-NEXT:    s_mul_i32 s5, s4, s3
504; GFX10-NEXT:    s_sub_i32 s2, s2, s5
505; GFX10-NEXT:    s_add_i32 s5, s4, 1
506; GFX10-NEXT:    s_sub_i32 s6, s2, s3
507; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
508; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
509; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
510; GFX10-NEXT:    s_add_i32 s5, s4, 1
511; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
512; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
513; GFX10-NEXT:    v_mov_b32_e32 v1, s2
514; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
515; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
516; GFX10-NEXT:    s_endpgm
517;
518; GFX9-FLATSCR-LABEL: udiv_i32:
519; GFX9-FLATSCR:       ; %bb.0:
520; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
521; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
523; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
524; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
525; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
526; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
527; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
528; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
529; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
530; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
531; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
532; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
533; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
534; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
535; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
536; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
537; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
538; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
539; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
540; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
541; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
542; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
543; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
544; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
545; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
546; GFX9-FLATSCR-NEXT:    s_endpgm
547;
548; GFX11-LABEL: udiv_i32:
549; GFX11:       ; %bb.0:
550; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
551; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s3
553; GFX11-NEXT:    s_sub_i32 s5, 0, s3
554; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
555; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
556; GFX11-NEXT:    s_waitcnt_depctr 0xfff
557; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
558; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
559; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
560; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
561; GFX11-NEXT:    v_mov_b32_e32 v0, 0
562; GFX11-NEXT:    s_mul_i32 s5, s5, s4
563; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
564; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s5
565; GFX11-NEXT:    s_add_i32 s4, s4, s5
566; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
567; GFX11-NEXT:    s_mul_hi_u32 s4, s2, s4
568; GFX11-NEXT:    s_mul_i32 s5, s4, s3
569; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
570; GFX11-NEXT:    s_sub_i32 s2, s2, s5
571; GFX11-NEXT:    s_add_i32 s5, s4, 1
572; GFX11-NEXT:    s_sub_i32 s6, s2, s3
573; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
574; GFX11-NEXT:    s_cselect_b32 s4, s5, s4
575; GFX11-NEXT:    s_cselect_b32 s2, s6, s2
576; GFX11-NEXT:    s_add_i32 s5, s4, 1
577; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
578; GFX11-NEXT:    s_cselect_b32 s2, s5, s4
579; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
580; GFX11-NEXT:    v_mov_b32_e32 v1, s2
581; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
582; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
583; GFX11-NEXT:    s_endpgm
584;
585; GFX12-LABEL: udiv_i32:
586; GFX12:       ; %bb.0:
587; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
588; GFX12-NEXT:    s_wait_kmcnt 0x0
589; GFX12-NEXT:    s_cvt_f32_u32 s4, s3
590; GFX12-NEXT:    s_sub_co_i32 s5, 0, s3
591; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
592; GFX12-NEXT:    v_rcp_iflag_f32_e32 v0, s4
593; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
594; GFX12-NEXT:    s_wait_alu 0xfffe
595; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
596; GFX12-NEXT:    s_mul_f32 s4, s4, 0x4f7ffffe
597; GFX12-NEXT:    s_wait_alu 0xfffe
598; GFX12-NEXT:    s_cvt_u32_f32 s4, s4
599; GFX12-NEXT:    s_wait_alu 0xfffe
600; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
601; GFX12-NEXT:    s_mul_i32 s5, s5, s4
602; GFX12-NEXT:    s_wait_alu 0xfffe
603; GFX12-NEXT:    s_mul_hi_u32 s5, s4, s5
604; GFX12-NEXT:    s_wait_alu 0xfffe
605; GFX12-NEXT:    s_add_co_i32 s4, s4, s5
606; GFX12-NEXT:    s_wait_alu 0xfffe
607; GFX12-NEXT:    s_mul_hi_u32 s4, s2, s4
608; GFX12-NEXT:    s_wait_alu 0xfffe
609; GFX12-NEXT:    s_mul_i32 s5, s4, s3
610; GFX12-NEXT:    s_wait_alu 0xfffe
611; GFX12-NEXT:    s_sub_co_i32 s2, s2, s5
612; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
613; GFX12-NEXT:    s_sub_co_i32 s6, s2, s3
614; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
615; GFX12-NEXT:    s_wait_alu 0xfffe
616; GFX12-NEXT:    s_cselect_b32 s4, s5, s4
617; GFX12-NEXT:    s_cselect_b32 s2, s6, s2
618; GFX12-NEXT:    s_wait_alu 0xfffe
619; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
620; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
621; GFX12-NEXT:    s_wait_alu 0xfffe
622; GFX12-NEXT:    s_cselect_b32 s2, s5, s4
623; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
624; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
625; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
626; GFX12-NEXT:    s_wait_storecnt 0x0
627; GFX12-NEXT:    s_endpgm
; Plain (non-atomic) divide and store; the IR contains no explicit ordering.
628  %r = udiv i32 %x, %y
629  store i32 %r, ptr addrspace(1) %out
630  ret void
631}
632
; Intrinsic used by @smrd_sgpr_offset: takes a <4 x i32> operand and two i32
; operands and returns a float.
633declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
634
635; from smrd.ll
636; covers s_buffer_load
637;
; amdgpu_ps function that calls the s.buffer.load intrinsic with the inreg
; descriptor %desc and the inreg %offset, and returns the loaded float.
; Per the assertions below, every variant expects a single s_buffer_load whose
; offset operand is a scalar register, directly followed by a wait on the
; scalar-memory counter before the result is moved to v0.
; The flat-scratch gfx900 run additionally expects the descriptor to be copied
; from s[2:5] into s[8:11] first, with the offset taken from s6.
; NOTE(review): attribute group #0 is defined outside the visible part of the
; file.
638define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
639; GFX9-LABEL: smrd_sgpr_offset:
640; GFX9:       ; %bb.0: ; %main_body
641; GFX9-NEXT:    s_buffer_load_dword s0, s[0:3], s4 offset:0x0
642; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX9-NEXT:    v_mov_b32_e32 v0, s0
644; GFX9-NEXT:    ; return to shader part epilog
645;
646; GFX90A-LABEL: smrd_sgpr_offset:
647; GFX90A:       ; %bb.0: ; %main_body
648; GFX90A-NEXT:    s_buffer_load_dword s0, s[0:3], s4 offset:0x0
649; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
651; GFX90A-NEXT:    ; return to shader part epilog
652;
653; GFX10-LABEL: smrd_sgpr_offset:
654; GFX10:       ; %bb.0: ; %main_body
655; GFX10-NEXT:    s_buffer_load_dword s0, s[0:3], s4 offset:0x0
656; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX10-NEXT:    v_mov_b32_e32 v0, s0
658; GFX10-NEXT:    ; return to shader part epilog
659;
660; GFX9-FLATSCR-LABEL: smrd_sgpr_offset:
661; GFX9-FLATSCR:       ; %bb.0: ; %main_body
662; GFX9-FLATSCR-NEXT:    s_mov_b32 s11, s5
663; GFX9-FLATSCR-NEXT:    s_mov_b32 s10, s4
664; GFX9-FLATSCR-NEXT:    s_mov_b32 s9, s3
665; GFX9-FLATSCR-NEXT:    s_mov_b32 s8, s2
666; GFX9-FLATSCR-NEXT:    s_buffer_load_dword s0, s[8:11], s6 offset:0x0
667; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
668; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
669; GFX9-FLATSCR-NEXT:    ; return to shader part epilog
670;
671; GFX11-LABEL: smrd_sgpr_offset:
672; GFX11:       ; %bb.0: ; %main_body
673; GFX11-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
674; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX11-NEXT:    v_mov_b32_e32 v0, s0
676; GFX11-NEXT:    ; return to shader part epilog
677;
678; GFX12-LABEL: smrd_sgpr_offset:
679; GFX12:       ; %bb.0: ; %main_body
680; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
681; GFX12-NEXT:    s_wait_kmcnt 0x0
682; GFX12-NEXT:    v_mov_b32_e32 v0, s0
683; GFX12-NEXT:    ; return to shader part epilog
684main_body:
; The trailing i32 0 is a constant operand (presumably the cache-policy field
; -- confirm against the intrinsic's definition).
685  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
686  ret float %r
687}
688
689; from atomic_load_add.ll
690; covers s_load, ds_add (atomic without return)
691; With +precise-memory the checks expect a wait directly after the s_load and directly after the ds_add.
692define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
693; GFX9-LABEL: atomic_add_local:
694; GFX9:       ; %bb.0:
695; GFX9-NEXT:    s_mov_b64 s[0:1], exec
696; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
697; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
698; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
699; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
700; GFX9-NEXT:    s_cbranch_execz .LBB5_2
701; GFX9-NEXT:  ; %bb.1:
702; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x24
703; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
705; GFX9-NEXT:    s_mul_i32 s0, s0, 5
706; GFX9-NEXT:    v_mov_b32_e32 v1, s0
707; GFX9-NEXT:    v_mov_b32_e32 v0, s2
708; GFX9-NEXT:    ds_add_u32 v0, v1
709; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
710; GFX9-NEXT:  .LBB5_2:
711; GFX9-NEXT:    s_endpgm
712;
713; GFX90A-LABEL: atomic_add_local:
714; GFX90A:       ; %bb.0:
715; GFX90A-NEXT:    s_mov_b64 s[0:1], exec
716; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
717; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
718; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
719; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
720; GFX90A-NEXT:    s_cbranch_execz .LBB5_2
721; GFX90A-NEXT:  ; %bb.1:
722; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x24
723; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
725; GFX90A-NEXT:    s_mul_i32 s0, s0, 5
726; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
727; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
728; GFX90A-NEXT:    ds_add_u32 v0, v1
729; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX90A-NEXT:  .LBB5_2:
731; GFX90A-NEXT:    s_endpgm
732;
733; GFX10-LABEL: atomic_add_local:
734; GFX10:       ; %bb.0:
735; GFX10-NEXT:    s_mov_b32 s0, exec_lo
736; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
737; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
738; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
739; GFX10-NEXT:    s_cbranch_execz .LBB5_2
740; GFX10-NEXT:  ; %bb.1:
741; GFX10-NEXT:    s_load_dword s1, s[4:5], 0x24
742; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
743; GFX10-NEXT:    s_bcnt1_i32_b32 s0, s0
744; GFX10-NEXT:    s_mul_i32 s0, s0, 5
745; GFX10-NEXT:    v_mov_b32_e32 v1, s0
746; GFX10-NEXT:    v_mov_b32_e32 v0, s1
747; GFX10-NEXT:    ds_add_u32 v0, v1
748; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX10-NEXT:    buffer_gl0_inv
750; GFX10-NEXT:  .LBB5_2:
751; GFX10-NEXT:    s_endpgm
752;
753; GFX9-FLATSCR-LABEL: atomic_add_local:
754; GFX9-FLATSCR:       ; %bb.0:
755; GFX9-FLATSCR-NEXT:    s_mov_b64 s[0:1], exec
756; GFX9-FLATSCR-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
757; GFX9-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
758; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
759; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
760; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB5_2
761; GFX9-FLATSCR-NEXT:  ; %bb.1:
762; GFX9-FLATSCR-NEXT:    s_load_dword s2, s[4:5], 0x24
763; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX9-FLATSCR-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
765; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, 5
766; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, s0
767; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
768; GFX9-FLATSCR-NEXT:    ds_add_u32 v0, v1
769; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX9-FLATSCR-NEXT:  .LBB5_2:
771; GFX9-FLATSCR-NEXT:    s_endpgm
772;
773; GFX11-LABEL: atomic_add_local:
774; GFX11:       ; %bb.0:
775; GFX11-NEXT:    s_mov_b32 s0, exec_lo
776; GFX11-NEXT:    s_mov_b32 s1, exec_lo
777; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
778; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
779; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
780; GFX11-NEXT:    s_cbranch_execz .LBB5_2
781; GFX11-NEXT:  ; %bb.1:
782; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x24
783; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX11-NEXT:    s_bcnt1_i32_b32 s0, s0
785; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
786; GFX11-NEXT:    s_mul_i32 s0, s0, 5
787; GFX11-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
788; GFX11-NEXT:    ds_add_u32 v0, v1
789; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX11-NEXT:    buffer_gl0_inv
791; GFX11-NEXT:  .LBB5_2:
792; GFX11-NEXT:    s_endpgm
793;
794; GFX12-LABEL: atomic_add_local:
795; GFX12:       ; %bb.0:
796; GFX12-NEXT:    s_mov_b32 s0, exec_lo
797; GFX12-NEXT:    s_mov_b32 s1, exec_lo
798; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
799; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
800; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
801; GFX12-NEXT:    s_cbranch_execz .LBB5_2
802; GFX12-NEXT:  ; %bb.1:
803; GFX12-NEXT:    s_load_b32 s1, s[4:5], 0x24
804; GFX12-NEXT:    s_wait_kmcnt 0x0
805; GFX12-NEXT:    s_wait_alu 0xfffe
806; GFX12-NEXT:    s_bcnt1_i32_b32 s0, s0
807; GFX12-NEXT:    s_wait_alu 0xfffe
808; GFX12-NEXT:    s_mul_i32 s0, s0, 5
809; GFX12-NEXT:    s_wait_alu 0xfffe
810; GFX12-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
811; GFX12-NEXT:    ds_add_u32 v0, v1
812; GFX12-NEXT:    s_wait_dscnt 0x0
813; GFX12-NEXT:    global_inv scope:SCOPE_SE
814; GFX12-NEXT:  .LBB5_2:
815; GFX12-NEXT:    s_endpgm
816   %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst ; result is never read, so the checks expect the non-returning ds_add_u32
817   ret void
818}
819
820; from flat_atomics_i32_system.ll
821; covers flat_atomic_swap (atomic without return)
822; With +precise-memory the checks expect the wait instruction(s) directly after the flat_atomic_swap.
823define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
824; GFX9-LABEL: flat_atomic_xchg_i32_noret:
825; GFX9:       ; %bb.0:
826; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
827; GFX9-NEXT:    flat_atomic_swap v[0:1], v2
828; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
829; GFX9-NEXT:    buffer_wbinvl1_vol
830; GFX9-NEXT:    s_setpc_b64 s[30:31]
831;
832; GFX90A-LABEL: flat_atomic_xchg_i32_noret:
833; GFX90A:       ; %bb.0:
834; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
835; GFX90A-NEXT:    buffer_wbl2
836; GFX90A-NEXT:    flat_atomic_swap v[0:1], v2
837; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
838; GFX90A-NEXT:    buffer_invl2
839; GFX90A-NEXT:    buffer_wbinvl1_vol
840; GFX90A-NEXT:    s_setpc_b64 s[30:31]
841;
842; GFX10-LABEL: flat_atomic_xchg_i32_noret:
843; GFX10:       ; %bb.0:
844; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
845; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
846; GFX10-NEXT:    flat_atomic_swap v[0:1], v2
847; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
848; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
849; GFX10-NEXT:    buffer_gl1_inv
850; GFX10-NEXT:    buffer_gl0_inv
851; GFX10-NEXT:    s_setpc_b64 s[30:31]
852;
853; GFX9-FLATSCR-LABEL: flat_atomic_xchg_i32_noret:
854; GFX9-FLATSCR:       ; %bb.0:
855; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
856; GFX9-FLATSCR-NEXT:    flat_atomic_swap v[0:1], v2
857; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
858; GFX9-FLATSCR-NEXT:    buffer_wbinvl1_vol
859; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
860;
861; GFX11-LABEL: flat_atomic_xchg_i32_noret:
862; GFX11:       ; %bb.0:
863; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
864; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
865; GFX11-NEXT:    flat_atomic_swap_b32 v[0:1], v2
866; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
868; GFX11-NEXT:    buffer_gl1_inv
869; GFX11-NEXT:    buffer_gl0_inv
870; GFX11-NEXT:    s_setpc_b64 s[30:31]
871;
872; GFX12-LABEL: flat_atomic_xchg_i32_noret:
873; GFX12:       ; %bb.0:
874; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
875; GFX12-NEXT:    s_wait_expcnt 0x0
876; GFX12-NEXT:    s_wait_samplecnt 0x0
877; GFX12-NEXT:    s_wait_bvhcnt 0x0
878; GFX12-NEXT:    s_wait_kmcnt 0x0
879; GFX12-NEXT:    global_wb scope:SCOPE_SYS
880; GFX12-NEXT:    s_wait_storecnt 0x0
881; GFX12-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
882; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
883; GFX12-NEXT:    global_inv scope:SCOPE_SYS
884; GFX12-NEXT:    s_setpc_b64 s[30:31]
885  %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst ; %tmp0 is never read, so the checks expect the non-returning swap form
886  ret void
887}
888
889; from atomic_load_add.ll
890; covers s_load, ds_add_rtn (atomic with return)
891; With +precise-memory the checks expect a wait directly after each s_load, the ds_add_rtn and the global_store.
892define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) {
893; GFX9-LABEL: atomic_add_ret_local:
894; GFX9:       ; %bb.0:
895; GFX9-NEXT:    s_mov_b64 s[2:3], exec
896; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
897; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
898; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
899; GFX9-NEXT:    ; implicit-def: $vgpr1
900; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
901; GFX9-NEXT:    s_cbranch_execz .LBB7_2
902; GFX9-NEXT:  ; %bb.1:
903; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
904; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
906; GFX9-NEXT:    s_mul_i32 s2, s2, 5
907; GFX9-NEXT:    v_mov_b32_e32 v2, s2
908; GFX9-NEXT:    v_mov_b32_e32 v1, s6
909; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
910; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
911; GFX9-NEXT:  .LBB7_2:
912; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
913; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
914; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
916; GFX9-NEXT:    v_mov_b32_e32 v2, 0
917; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
918; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
919; GFX9-NEXT:    s_waitcnt vmcnt(0)
920; GFX9-NEXT:    s_endpgm
921;
922; GFX90A-LABEL: atomic_add_ret_local:
923; GFX90A:       ; %bb.0:
924; GFX90A-NEXT:    s_mov_b64 s[2:3], exec
925; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
926; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
927; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
928; GFX90A-NEXT:    ; implicit-def: $vgpr1
929; GFX90A-NEXT:    s_and_saveexec_b64 s[0:1], vcc
930; GFX90A-NEXT:    s_cbranch_execz .LBB7_2
931; GFX90A-NEXT:  ; %bb.1:
932; GFX90A-NEXT:    s_load_dword s6, s[4:5], 0x2c
933; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX90A-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
935; GFX90A-NEXT:    s_mul_i32 s2, s2, 5
936; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
937; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
938; GFX90A-NEXT:    ds_add_rtn_u32 v1, v1, v2
939; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX90A-NEXT:  .LBB7_2:
941; GFX90A-NEXT:    s_or_b64 exec, exec, s[0:1]
942; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
943; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX90A-NEXT:    v_readfirstlane_b32 s2, v1
945; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
946; GFX90A-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
947; GFX90A-NEXT:    global_store_dword v2, v0, s[0:1]
948; GFX90A-NEXT:    s_waitcnt vmcnt(0)
949; GFX90A-NEXT:    s_endpgm
950;
951; GFX10-LABEL: atomic_add_ret_local:
952; GFX10:       ; %bb.0:
953; GFX10-NEXT:    s_mov_b32 s1, exec_lo
954; GFX10-NEXT:    ; implicit-def: $vgpr1
955; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
956; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
957; GFX10-NEXT:    s_and_saveexec_b32 s0, vcc_lo
958; GFX10-NEXT:    s_cbranch_execz .LBB7_2
959; GFX10-NEXT:  ; %bb.1:
960; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
961; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
962; GFX10-NEXT:    s_bcnt1_i32_b32 s1, s1
963; GFX10-NEXT:    s_mul_i32 s1, s1, 5
964; GFX10-NEXT:    v_mov_b32_e32 v2, s1
965; GFX10-NEXT:    v_mov_b32_e32 v1, s2
966; GFX10-NEXT:    ds_add_rtn_u32 v1, v1, v2
967; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX10-NEXT:    buffer_gl0_inv
969; GFX10-NEXT:  .LBB7_2:
970; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
971; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
972; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
973; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX10-NEXT:    v_readfirstlane_b32 s2, v1
975; GFX10-NEXT:    v_mov_b32_e32 v1, 0
976; GFX10-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
977; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
978; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
979; GFX10-NEXT:    s_endpgm
980;
981; GFX9-FLATSCR-LABEL: atomic_add_ret_local:
982; GFX9-FLATSCR:       ; %bb.0:
983; GFX9-FLATSCR-NEXT:    s_mov_b64 s[2:3], exec
984; GFX9-FLATSCR-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
985; GFX9-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
986; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
987; GFX9-FLATSCR-NEXT:    ; implicit-def: $vgpr1
988; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
989; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB7_2
990; GFX9-FLATSCR-NEXT:  ; %bb.1:
991; GFX9-FLATSCR-NEXT:    s_load_dword s6, s[4:5], 0x2c
992; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX9-FLATSCR-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
994; GFX9-FLATSCR-NEXT:    s_mul_i32 s2, s2, 5
995; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, s2
996; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, s6
997; GFX9-FLATSCR-NEXT:    ds_add_rtn_u32 v1, v1, v2
998; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX9-FLATSCR-NEXT:  .LBB7_2:
1000; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
1001; GFX9-FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1002; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1003; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s2, v1
1004; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
1005; GFX9-FLATSCR-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1006; GFX9-FLATSCR-NEXT:    global_store_dword v2, v0, s[0:1]
1007; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1008; GFX9-FLATSCR-NEXT:    s_endpgm
1009;
1010; GFX11-LABEL: atomic_add_ret_local:
1011; GFX11:       ; %bb.0:
1012; GFX11-NEXT:    s_mov_b32 s1, exec_lo
1013; GFX11-NEXT:    s_mov_b32 s0, exec_lo
1014; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1015; GFX11-NEXT:    ; implicit-def: $vgpr1
1016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1017; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
1018; GFX11-NEXT:    s_cbranch_execz .LBB7_2
1019; GFX11-NEXT:  ; %bb.1:
1020; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
1021; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX11-NEXT:    s_bcnt1_i32_b32 s1, s1
1023; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1024; GFX11-NEXT:    s_mul_i32 s1, s1, 5
1025; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2
1026; GFX11-NEXT:    ds_add_rtn_u32 v1, v1, v2
1027; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX11-NEXT:    buffer_gl0_inv
1029; GFX11-NEXT:  .LBB7_2:
1030; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1031; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1032; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1033; GFX11-NEXT:    v_readfirstlane_b32 s2, v1
1034; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1035; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1036; GFX11-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1037; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1038; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1039; GFX11-NEXT:    s_endpgm
1040;
1041; GFX12-LABEL: atomic_add_ret_local:
1042; GFX12:       ; %bb.0:
1043; GFX12-NEXT:    s_mov_b32 s1, exec_lo
1044; GFX12-NEXT:    s_mov_b32 s0, exec_lo
1045; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1046; GFX12-NEXT:    ; implicit-def: $vgpr1
1047; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1048; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
1049; GFX12-NEXT:    s_cbranch_execz .LBB7_2
1050; GFX12-NEXT:  ; %bb.1:
1051; GFX12-NEXT:    s_load_b32 s2, s[4:5], 0x2c
1052; GFX12-NEXT:    s_wait_kmcnt 0x0
1053; GFX12-NEXT:    s_wait_alu 0xfffe
1054; GFX12-NEXT:    s_bcnt1_i32_b32 s1, s1
1055; GFX12-NEXT:    s_wait_alu 0xfffe
1056; GFX12-NEXT:    s_mul_i32 s1, s1, 5
1057; GFX12-NEXT:    s_wait_alu 0xfffe
1058; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2
1059; GFX12-NEXT:    ds_add_rtn_u32 v1, v1, v2
1060; GFX12-NEXT:    s_wait_dscnt 0x0
1061; GFX12-NEXT:    global_inv scope:SCOPE_SE
1062; GFX12-NEXT:  .LBB7_2:
1063; GFX12-NEXT:    s_wait_alu 0xfffe
1064; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1065; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1066; GFX12-NEXT:    s_wait_kmcnt 0x0
1067; GFX12-NEXT:    v_readfirstlane_b32 s2, v1
1068; GFX12-NEXT:    v_mov_b32_e32 v1, 0
1069; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1070; GFX12-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1071; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
1072; GFX12-NEXT:    s_wait_storecnt 0x0
1073; GFX12-NEXT:    s_endpgm
1074  %val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst ; %val is stored below, so the checks expect the returning ds_add_rtn_u32
1075  store i32 %val, ptr addrspace(1) %out ; this store is what produces the global_store in the checks
1076  ret void
1077}
1078
1079declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
1080
1081; from atomic_optimizations_buffer.ll
1082; covers buffer_atomic (atomic with return)
1083; With +precise-memory the checks expect a wait directly after each s_load, the buffer_atomic_add and the global_store.
1084define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1085; GFX9-LABEL: add_i32_constant:
1086; GFX9:       ; %bb.0: ; %entry
1087; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1088; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1089; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1090; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1091; GFX9-NEXT:    ; implicit-def: $vgpr1
1092; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1093; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1094; GFX9-NEXT:  ; %bb.1:
1095; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1096; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1098; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1099; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1100; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
1101; GFX9-NEXT:    s_waitcnt vmcnt(0)
1102; GFX9-NEXT:  .LBB8_2:
1103; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1104; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1107; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1108; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1109; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1110; GFX9-NEXT:    s_waitcnt vmcnt(0)
1111; GFX9-NEXT:    s_endpgm
1112;
1113; GFX90A-LABEL: add_i32_constant:
1114; GFX90A:       ; %bb.0: ; %entry
1115; GFX90A-NEXT:    s_mov_b64 s[2:3], exec
1116; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1117; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1118; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1119; GFX90A-NEXT:    ; implicit-def: $vgpr1
1120; GFX90A-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1121; GFX90A-NEXT:    s_cbranch_execz .LBB8_2
1122; GFX90A-NEXT:  ; %bb.1:
1123; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1124; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX90A-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1126; GFX90A-NEXT:    s_mul_i32 s2, s2, 5
1127; GFX90A-NEXT:    v_mov_b32_e32 v1, s2
1128; GFX90A-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
1129; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1130; GFX90A-NEXT:  .LBB8_2:
1131; GFX90A-NEXT:    s_or_b64 exec, exec, s[0:1]
1132; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1133; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX90A-NEXT:    v_readfirstlane_b32 s2, v1
1135; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1136; GFX90A-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1137; GFX90A-NEXT:    global_store_dword v2, v0, s[0:1]
1138; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1139; GFX90A-NEXT:    s_endpgm
1140;
1141; GFX10-LABEL: add_i32_constant:
1142; GFX10:       ; %bb.0: ; %entry
1143; GFX10-NEXT:    s_mov_b32 s1, exec_lo
1144; GFX10-NEXT:    ; implicit-def: $vgpr1
1145; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1146; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1147; GFX10-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1148; GFX10-NEXT:    s_cbranch_execz .LBB8_2
1149; GFX10-NEXT:  ; %bb.1:
1150; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1151; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1152; GFX10-NEXT:    s_bcnt1_i32_b32 s1, s1
1153; GFX10-NEXT:    s_mul_i32 s1, s1, 5
1154; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1155; GFX10-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
1156; GFX10-NEXT:    s_waitcnt vmcnt(0)
1157; GFX10-NEXT:  .LBB8_2:
1158; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1159; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1160; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1161; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX10-NEXT:    v_readfirstlane_b32 s2, v1
1163; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1164; GFX10-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1165; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1166; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1167; GFX10-NEXT:    s_endpgm
1168;
1169; GFX9-FLATSCR-LABEL: add_i32_constant:
1170; GFX9-FLATSCR:       ; %bb.0: ; %entry
1171; GFX9-FLATSCR-NEXT:    s_mov_b64 s[2:3], exec
1172; GFX9-FLATSCR-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1173; GFX9-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1174; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1175; GFX9-FLATSCR-NEXT:    ; implicit-def: $vgpr1
1176; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1177; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB8_2
1178; GFX9-FLATSCR-NEXT:  ; %bb.1:
1179; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1180; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1181; GFX9-FLATSCR-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1182; GFX9-FLATSCR-NEXT:    s_mul_i32 s2, s2, 5
1183; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, s2
1184; GFX9-FLATSCR-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
1185; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1186; GFX9-FLATSCR-NEXT:  .LBB8_2:
1187; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
1188; GFX9-FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1189; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s2, v1
1191; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
1192; GFX9-FLATSCR-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1193; GFX9-FLATSCR-NEXT:    global_store_dword v2, v0, s[0:1]
1194; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1195; GFX9-FLATSCR-NEXT:    s_endpgm
1196;
1197; GFX11-LABEL: add_i32_constant:
1198; GFX11:       ; %bb.0: ; %entry
1199; GFX11-NEXT:    s_mov_b32 s1, exec_lo
1200; GFX11-NEXT:    s_mov_b32 s0, exec_lo
1201; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1202; GFX11-NEXT:    ; implicit-def: $vgpr1
1203; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1204; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
1205; GFX11-NEXT:    s_cbranch_execz .LBB8_2
1206; GFX11-NEXT:  ; %bb.1:
1207; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1208; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX11-NEXT:    s_bcnt1_i32_b32 s1, s1
1210; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1211; GFX11-NEXT:    s_mul_i32 s1, s1, 5
1212; GFX11-NEXT:    v_mov_b32_e32 v1, s1
1213; GFX11-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
1214; GFX11-NEXT:    s_waitcnt vmcnt(0)
1215; GFX11-NEXT:  .LBB8_2:
1216; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1217; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1218; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1219; GFX11-NEXT:    v_readfirstlane_b32 s2, v1
1220; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1221; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1222; GFX11-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1223; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1224; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1225; GFX11-NEXT:    s_endpgm
1226;
1227; GFX12-LABEL: add_i32_constant:
1228; GFX12:       ; %bb.0: ; %entry
1229; GFX12-NEXT:    s_mov_b32 s1, exec_lo
1230; GFX12-NEXT:    s_mov_b32 s0, exec_lo
1231; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1232; GFX12-NEXT:    ; implicit-def: $vgpr1
1233; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1234; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
1235; GFX12-NEXT:    s_cbranch_execz .LBB8_2
1236; GFX12-NEXT:  ; %bb.1:
1237; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1238; GFX12-NEXT:    s_wait_kmcnt 0x0
1239; GFX12-NEXT:    s_wait_alu 0xfffe
1240; GFX12-NEXT:    s_bcnt1_i32_b32 s1, s1
1241; GFX12-NEXT:    s_wait_alu 0xfffe
1242; GFX12-NEXT:    s_mul_i32 s1, s1, 5
1243; GFX12-NEXT:    s_wait_alu 0xfffe
1244; GFX12-NEXT:    v_mov_b32_e32 v1, s1
1245; GFX12-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1246; GFX12-NEXT:    s_wait_loadcnt 0x0
1247; GFX12-NEXT:  .LBB8_2:
1248; GFX12-NEXT:    s_wait_alu 0xfffe
1249; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1250; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1251; GFX12-NEXT:    s_wait_kmcnt 0x0
1252; GFX12-NEXT:    v_readfirstlane_b32 s2, v1
1253; GFX12-NEXT:    v_mov_b32_e32 v1, 0
1254; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1255; GFX12-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
1256; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
1257; GFX12-NEXT:    s_wait_storecnt 0x0
1258; GFX12-NEXT:    s_endpgm
1259entry:
1260  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) ; %old is stored below, so the checks expect the returning form of buffer_atomic_add
1261  store i32 %old, ptr addrspace(1) %out ; this store is what produces the global_store in the checks
1262  ret void
1263}
1264
1265declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32)
1266
1267; from llvm.amdgcn.image.load.a16.ll
1268; covers image_load
1269; With +precise-memory the checks expect a wait directly after the image_load.
1270define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
1271; GFX9-LABEL: load.f32.1d:
1272; GFX9:       ; %bb.0: ; %main_body
1273; GFX9-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 unorm a16
1274; GFX9-NEXT:    s_waitcnt vmcnt(0)
1275; GFX9-NEXT:    ; return to shader part epilog
1276;
1277; GFX90A-LABEL: load.f32.1d:
1278; GFX90A:       ; %bb.0: ; %main_body
1279; GFX90A-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 unorm a16
1280; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1281; GFX90A-NEXT:    ; return to shader part epilog
1282;
1283; GFX10-LABEL: load.f32.1d:
1284; GFX10:       ; %bb.0: ; %main_body
1285; GFX10-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1286; GFX10-NEXT:    s_waitcnt vmcnt(0)
1287; GFX10-NEXT:    ; return to shader part epilog
1288;
1289; GFX9-FLATSCR-LABEL: load.f32.1d:
1290; GFX9-FLATSCR:       ; %bb.0: ; %main_body
1291; GFX9-FLATSCR-NEXT:    s_mov_b32 s11, s9
1292; GFX9-FLATSCR-NEXT:    s_mov_b32 s10, s8
1293; GFX9-FLATSCR-NEXT:    s_mov_b32 s9, s7
1294; GFX9-FLATSCR-NEXT:    s_mov_b32 s8, s6
1295; GFX9-FLATSCR-NEXT:    s_mov_b32 s7, s5
1296; GFX9-FLATSCR-NEXT:    s_mov_b32 s6, s4
1297; GFX9-FLATSCR-NEXT:    s_mov_b32 s5, s3
1298; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, s2
1299; GFX9-FLATSCR-NEXT:    image_load v0, v0, s[4:11] dmask:0x1 unorm a16
1300; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1301; GFX9-FLATSCR-NEXT:    ; return to shader part epilog
1302;
1303; GFX11-LABEL: load.f32.1d:
1304; GFX11:       ; %bb.0: ; %main_body
1305; GFX11-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1306; GFX11-NEXT:    s_waitcnt vmcnt(0)
1307; GFX11-NEXT:    ; return to shader part epilog
1308;
1309; GFX12-LABEL: load.f32.1d:
1310; GFX12:       ; %bb.0: ; %main_body
1311; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16
1312; GFX12-NEXT:    s_wait_loadcnt 0x0
1313; GFX12-NEXT:    ; return to shader part epilog
1314main_body:
1315  %x = extractelement <2 x i16> %coords, i32 0 ; only element 0 of %coords is used as the coordinate
1316  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) ; the loaded vector is returned directly
1317  ret <4 x float> %v
1318}
1319
1320declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32)
1321
1322; from llvm.amdgcn.image.store.a16.ll
1323; covers image_store
1324; With +precise-memory the checks expect a wait directly after the image_store (vmcnt, vscnt or storecnt depending on the target).
1325define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
1326; GFX9-LABEL: store_f32_1d:
1327; GFX9:       ; %bb.0: ; %main_body
1328; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
1329; GFX9-NEXT:    s_waitcnt vmcnt(0)
1330; GFX9-NEXT:    s_endpgm
1331;
1332; GFX90A-LABEL: store_f32_1d:
1333; GFX90A:       ; %bb.0: ; %main_body
1334; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
1335; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
1336; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
1337; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
1338; GFX90A-NEXT:    image_store v[2:5], v0, s[0:7] dmask:0x1 unorm a16
1339; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1340; GFX90A-NEXT:    s_endpgm
1341;
1342; GFX10-LABEL: store_f32_1d:
1343; GFX10:       ; %bb.0: ; %main_body
1344; GFX10-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1345; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1346; GFX10-NEXT:    s_endpgm
1347;
1348; GFX9-FLATSCR-LABEL: store_f32_1d:
1349; GFX9-FLATSCR:       ; %bb.0: ; %main_body
1350; GFX9-FLATSCR-NEXT:    s_mov_b32 s11, s9
1351; GFX9-FLATSCR-NEXT:    s_mov_b32 s10, s8
1352; GFX9-FLATSCR-NEXT:    s_mov_b32 s9, s7
1353; GFX9-FLATSCR-NEXT:    s_mov_b32 s8, s6
1354; GFX9-FLATSCR-NEXT:    s_mov_b32 s7, s5
1355; GFX9-FLATSCR-NEXT:    s_mov_b32 s6, s4
1356; GFX9-FLATSCR-NEXT:    s_mov_b32 s5, s3
1357; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, s2
1358; GFX9-FLATSCR-NEXT:    image_store v[1:4], v0, s[4:11] dmask:0x1 unorm a16
1359; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1360; GFX9-FLATSCR-NEXT:    s_endpgm
1361;
1362; GFX11-LABEL: store_f32_1d:
1363; GFX11:       ; %bb.0: ; %main_body
1364; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1365; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1366; GFX11-NEXT:    s_endpgm
1367;
1368; GFX12-LABEL: store_f32_1d:
1369; GFX12:       ; %bb.0: ; %main_body
1370; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16
1371; GFX12-NEXT:    s_wait_storecnt 0x0
1372; GFX12-NEXT:    s_endpgm
1373
1374main_body:
1375  %x = extractelement <2 x i16> %coords, i32 0 ; only element 0 of %coords is used as the coordinate
1376  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0) ; no result: the function ends right after the store
1377  ret void
1378}
1379
1380declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32)
1381
1382; from llvm.amdgcn.image.atomic.dim.ll
1383; covers image_atomic (atomic with return)
1384; With +precise-memory the checks expect a wait directly after the returning image_atomic_swap.
1385define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
1386; GFX9-LABEL: atomic_swap_1d:
1387; GFX9:       ; %bb.0: ; %main_body
1388; GFX9-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc
1389; GFX9-NEXT:    s_waitcnt vmcnt(0)
1390; GFX9-NEXT:    ; return to shader part epilog
1391;
1392; GFX90A-LABEL: atomic_swap_1d:
1393; GFX90A:       ; %bb.0: ; %main_body
1394; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
1395; GFX90A-NEXT:    image_atomic_swap v0, v2, s[0:7] dmask:0x1 unorm glc
1396; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1397; GFX90A-NEXT:    ; return to shader part epilog
1398;
1399; GFX10-LABEL: atomic_swap_1d:
1400; GFX10:       ; %bb.0: ; %main_body
1401; GFX10-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
1402; GFX10-NEXT:    s_waitcnt vmcnt(0)
1403; GFX10-NEXT:    ; return to shader part epilog
1404;
1405; GFX9-FLATSCR-LABEL: atomic_swap_1d:
1406; GFX9-FLATSCR:       ; %bb.0: ; %main_body
1407; GFX9-FLATSCR-NEXT:    s_mov_b32 s11, s9
1408; GFX9-FLATSCR-NEXT:    s_mov_b32 s10, s8
1409; GFX9-FLATSCR-NEXT:    s_mov_b32 s9, s7
1410; GFX9-FLATSCR-NEXT:    s_mov_b32 s8, s6
1411; GFX9-FLATSCR-NEXT:    s_mov_b32 s7, s5
1412; GFX9-FLATSCR-NEXT:    s_mov_b32 s6, s4
1413; GFX9-FLATSCR-NEXT:    s_mov_b32 s5, s3
1414; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, s2
1415; GFX9-FLATSCR-NEXT:    image_atomic_swap v0, v1, s[4:11] dmask:0x1 unorm glc
1416; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1417; GFX9-FLATSCR-NEXT:    ; return to shader part epilog
1418;
1419; GFX11-LABEL: atomic_swap_1d:
1420; GFX11:       ; %bb.0: ; %main_body
1421; GFX11-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
1422; GFX11-NEXT:    s_waitcnt vmcnt(0)
1423; GFX11-NEXT:    ; return to shader part epilog
1424;
1425; GFX12-LABEL: atomic_swap_1d:
1426; GFX12:       ; %bb.0: ; %main_body
1427; GFX12-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
1428; GFX12-NEXT:    s_wait_loadcnt 0x0
1429; GFX12-NEXT:    ; return to shader part epilog
1430main_body:
1431  %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ; %v is used below, so the checks expect the returning form
1432  %out = bitcast i32 %v to float ; reinterpret the i32 result as float to match the return type
1433  ret float %out
1434}
1435
1436; from lds-bounds.ll
1437; covers ds_write_b64 / ds_store_b64 (plain store, not an atomic; no return value)
1438@compute_lds = external addrspace(3) global [512 x i32], align 16 ; NOTE(review): not referenced by @store_aligned below - confirm it is still needed
1439; The two adjacent i32 stores are expected to become one 64-bit DS write followed directly by a wait.
1440define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 {
1441; GFX9-LABEL: store_aligned:
1442; GFX9:       ; %bb.0: ; %entry
1443; GFX9-NEXT:    v_mov_b32_e32 v1, 42
1444; GFX9-NEXT:    v_mov_b32_e32 v2, 43
1445; GFX9-NEXT:    ds_write_b64 v0, v[1:2]
1446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1447; GFX9-NEXT:    s_endpgm
1448;
1449; GFX90A-LABEL: store_aligned:
1450; GFX90A:       ; %bb.0: ; %entry
1451; GFX90A-NEXT:    v_mov_b32_e32 v2, 42
1452; GFX90A-NEXT:    v_mov_b32_e32 v3, 43
1453; GFX90A-NEXT:    ds_write_b64 v0, v[2:3]
1454; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1455; GFX90A-NEXT:    s_endpgm
1456;
1457; GFX10-LABEL: store_aligned:
1458; GFX10:       ; %bb.0: ; %entry
1459; GFX10-NEXT:    v_mov_b32_e32 v1, 42
1460; GFX10-NEXT:    v_mov_b32_e32 v2, 43
1461; GFX10-NEXT:    ds_write_b64 v0, v[1:2]
1462; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1463; GFX10-NEXT:    s_endpgm
1464;
1465; GFX9-FLATSCR-LABEL: store_aligned:
1466; GFX9-FLATSCR:       ; %bb.0: ; %entry
1467; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 42
1468; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 43
1469; GFX9-FLATSCR-NEXT:    ds_write_b64 v0, v[1:2]
1470; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1471; GFX9-FLATSCR-NEXT:    s_endpgm
1472;
1473; GFX11-LABEL: store_aligned:
1474; GFX11:       ; %bb.0: ; %entry
1475; GFX11-NEXT:    v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43
1476; GFX11-NEXT:    ds_store_b64 v0, v[1:2]
1477; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1478; GFX11-NEXT:    s_endpgm
1479;
1480; GFX12-LABEL: store_aligned:
1481; GFX12:       ; %bb.0: ; %entry
1482; GFX12-NEXT:    v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43
1483; GFX12-NEXT:    ds_store_b64 v0, v[1:2]
1484; GFX12-NEXT:    s_wait_dscnt 0x0
1485; GFX12-NEXT:    s_endpgm
1486entry:
1487  %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1 ; address of the i32 slot right after %ptr
1488
1489  store i32 42, ptr addrspace(3) %ptr, align 8 ; first store carries the 8-byte alignment
1490  store i32 43, ptr addrspace(3) %ptr.gep.1 ; second store targets the adjacent slot
1491  ret void
1492}
1493
1494
1495; from lds-bounds.ll
1496; covers ds_read_b64
1497;
1498define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 {
1499; GFX9-LABEL: load_aligned:
1500; GFX9:       ; %bb.0: ; %entry
1501; GFX9-NEXT:    ds_read_b64 v[0:1], v0
1502; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX9-NEXT:    ; return to shader part epilog
1504;
1505; GFX90A-LABEL: load_aligned:
1506; GFX90A:       ; %bb.0: ; %entry
1507; GFX90A-NEXT:    ds_read_b64 v[0:1], v0
1508; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1509; GFX90A-NEXT:    ; return to shader part epilog
1510;
1511; GFX10-LABEL: load_aligned:
1512; GFX10:       ; %bb.0: ; %entry
1513; GFX10-NEXT:    ds_read_b64 v[0:1], v0
1514; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX10-NEXT:    ; return to shader part epilog
1516;
1517; GFX9-FLATSCR-LABEL: load_aligned:
1518; GFX9-FLATSCR:       ; %bb.0: ; %entry
1519; GFX9-FLATSCR-NEXT:    ds_read_b64 v[0:1], v0
1520; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1521; GFX9-FLATSCR-NEXT:    ; return to shader part epilog
1522;
1523; GFX11-LABEL: load_aligned:
1524; GFX11:       ; %bb.0: ; %entry
1525; GFX11-NEXT:    ds_load_b64 v[0:1], v0
1526; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1527; GFX11-NEXT:    ; return to shader part epilog
1528;
1529; GFX12-LABEL: load_aligned:
1530; GFX12:       ; %bb.0: ; %entry
1531; GFX12-NEXT:    ds_load_b64 v[0:1], v0
1532; GFX12-NEXT:    s_wait_dscnt 0x0
1533; GFX12-NEXT:    ; return to shader part epilog
1534entry:
; Purpose: two adjacent i32 loads from addrspace(3) memory. With +precise-memory
; (see the RUN lines) the assertions above expect one 64-bit DS load
; (ds_read_b64, or ds_load_b64 on the gfx1100/gfx1200 runs) that is immediately
; followed by a wait (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on the gfx1200
; run).
;
; Address of the i32 slot directly after %ptr.
1535  %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
1536
; Only the first load carries an explicit "align 8"; the second load has no
; explicit alignment.
1537  %v.0 = load i32, ptr addrspace(3) %ptr, align 8
1538  %v.1 = load i32, ptr addrspace(3) %ptr.gep.1
1539
; Pack the two loaded words into a <2 x i32> (element 0 = %v.0, element 1 =
; %v.1) and reinterpret the bits as <2 x float> to form the return value.
1540  %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0
1541  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
1542  %bc = bitcast <2 x i32> %r.1 to <2 x float>
1543  ret <2 x float> %bc
1544}
1545
1546; from lds-bounds.ll
1547; covers ds_write2_b32
1548;
1549define amdgpu_cs void @store_global_const_idx() #0 {
1550; GFX9-LABEL: store_global_const_idx:
1551; GFX9:       ; %bb.0: ; %entry
1552; GFX9-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1553; GFX9-NEXT:    v_mov_b32_e32 v1, 42
1554; GFX9-NEXT:    v_mov_b32_e32 v2, 43
1555; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1557; GFX9-NEXT:    s_endpgm
1558;
1559; GFX90A-LABEL: store_global_const_idx:
1560; GFX90A:       ; %bb.0: ; %entry
1561; GFX90A-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1562; GFX90A-NEXT:    v_mov_b32_e32 v1, 42
1563; GFX90A-NEXT:    v_mov_b32_e32 v2, 43
1564; GFX90A-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1565; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1566; GFX90A-NEXT:    s_endpgm
1567;
1568; GFX10-LABEL: store_global_const_idx:
1569; GFX10:       ; %bb.0: ; %entry
1570; GFX10-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1571; GFX10-NEXT:    v_mov_b32_e32 v1, 42
1572; GFX10-NEXT:    v_mov_b32_e32 v2, 43
1573; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1574; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1575; GFX10-NEXT:    s_endpgm
1576;
1577; GFX9-FLATSCR-LABEL: store_global_const_idx:
1578; GFX9-FLATSCR:       ; %bb.0: ; %entry
1579; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1580; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 42
1581; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 43
1582; GFX9-FLATSCR-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1583; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX9-FLATSCR-NEXT:    s_endpgm
1585;
1586; GFX11-LABEL: store_global_const_idx:
1587; GFX11:       ; %bb.0: ; %entry
1588; GFX11-NEXT:    v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42
1589; GFX11-NEXT:    v_mov_b32_e32 v2, 43
1590; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4
1591; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1592; GFX11-NEXT:    s_endpgm
1593;
1594; GFX12-LABEL: store_global_const_idx:
1595; GFX12:       ; %bb.0: ; %entry
1596; GFX12-NEXT:    v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42
1597; GFX12-NEXT:    v_mov_b32_e32 v2, 43
1598; GFX12-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4
1599; GFX12-NEXT:    s_wait_dscnt 0x0
1600; GFX12-NEXT:    s_endpgm
1601entry:
; Purpose: two i32 stores to constant indices of the external addrspace(3)
; global @compute_lds. With +precise-memory (see the RUN lines) the assertions
; above expect one two-address DS store (ds_write2_b32, or ds_store_2addr_b32
; on the gfx1100/gfx1200 runs) that is immediately followed by a wait
; (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on the gfx1200 run) before
; s_endpgm.
;
; Element indices 3 and 4 of @compute_lds; these match offset0:3 and offset1:4
; in the expected instruction.
1602  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
1603  %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
1604
; Store 42 to element 3 and 43 to element 4.
1605  store i32 42, ptr addrspace(3) %ptr.a
1606  store i32 43, ptr addrspace(3) %ptr.b
1607  ret void
1608}
1609
1610; from lds-bounds.ll
1611; covers ds_read2_b32
1612;
1613define amdgpu_cs <2 x float> @load_global_const_idx() #0 {
1614; GFX9-LABEL: load_global_const_idx:
1615; GFX9:       ; %bb.0: ; %entry
1616; GFX9-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1617; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX9-NEXT:    ; return to shader part epilog
1620;
1621; GFX90A-LABEL: load_global_const_idx:
1622; GFX90A:       ; %bb.0: ; %entry
1623; GFX90A-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1624; GFX90A-NEXT:    ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1625; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1626; GFX90A-NEXT:    ; return to shader part epilog
1627;
1628; GFX10-LABEL: load_global_const_idx:
1629; GFX10:       ; %bb.0: ; %entry
1630; GFX10-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1631; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1632; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1633; GFX10-NEXT:    ; return to shader part epilog
1634;
1635; GFX9-FLATSCR-LABEL: load_global_const_idx:
1636; GFX9-FLATSCR:       ; %bb.0: ; %entry
1637; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1638; GFX9-FLATSCR-NEXT:    ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1639; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
1640; GFX9-FLATSCR-NEXT:    ; return to shader part epilog
1641;
1642; GFX11-LABEL: load_global_const_idx:
1643; GFX11:       ; %bb.0: ; %entry
1644; GFX11-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1645; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4
1646; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1647; GFX11-NEXT:    ; return to shader part epilog
1648;
1649; GFX12-LABEL: load_global_const_idx:
1650; GFX12:       ; %bb.0: ; %entry
1651; GFX12-NEXT:    v_mov_b32_e32 v0, compute_lds@abs32@lo
1652; GFX12-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4
1653; GFX12-NEXT:    s_wait_dscnt 0x0
1654; GFX12-NEXT:    ; return to shader part epilog
1655entry:
; Purpose: two i32 loads from constant indices of the external addrspace(3)
; global @compute_lds. With +precise-memory (see the RUN lines) the assertions
; above expect one two-address DS load (ds_read2_b32, or ds_load_2addr_b32 on
; the gfx1100/gfx1200 runs) that is immediately followed by a wait
; (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on the gfx1200 run).
;
; Element indices 3 and 4 of @compute_lds; these match offset0:3 and offset1:4
; in the expected instruction.
1656  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
1657  %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
1658
1659  %v.0 = load i32, ptr addrspace(3) %ptr.a
1660  %v.1 = load i32, ptr addrspace(3) %ptr.b
1661
; Pack the two loaded words into a <2 x i32> (element 0 = %v.0, element 1 =
; %v.1) and reinterpret the bits as <2 x float> to form the return value.
1662  %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0
1663  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
1664  %bc = bitcast <2 x i32> %r.1 to <2 x float>
1665  ret <2 x float> %bc
1666}
1667