xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
5
; External i32 constant in addrspace(4). The func_* tests in this file load
; their dynamic element count from it.
6@gv = external addrspace(4) constant i32
7
; Kernel variant, align 4: the element count %n arrives as a kernel argument,
; %n i32 elements are allocated in addrspace(5) with align 4, and 0 is stored
; through the resulting pointer.
; In the expected output for every target the count goes through
; s_lshl2_add_u32 ..., 15 and s_and_b32 ..., -16, is shifted left by 6 (gfx900)
; or 5 (gfx1010, gfx1100), and is then added to a copy of s32 to form the new
; s32 (presumably the stack pointer - confirm against the AMDGPU ABI docs).
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
8define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
9; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
12; GFX9-NEXT:    s_add_u32 s0, s0, s17
13; GFX9-NEXT:    s_movk_i32 s32, 0x400
14; GFX9-NEXT:    s_addc_u32 s1, s1, 0
15; GFX9-NEXT:    s_mov_b32 s4, s32
16; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX9-NEXT:    s_lshl2_add_u32 s5, s5, 15
18; GFX9-NEXT:    s_and_b32 s5, s5, -16
19; GFX9-NEXT:    v_mov_b32_e32 v0, 0
20; GFX9-NEXT:    v_mov_b32_e32 v1, s4
21; GFX9-NEXT:    s_lshl_b32 s5, s5, 6
22; GFX9-NEXT:    s_mov_b32 s33, 0
23; GFX9-NEXT:    s_add_u32 s32, s4, s5
24; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
25; GFX9-NEXT:    s_endpgm
26;
27; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
28; GFX10:       ; %bb.0:
29; GFX10-NEXT:    s_load_dword s5, s[8:9], 0x0
30; GFX10-NEXT:    s_movk_i32 s32, 0x200
31; GFX10-NEXT:    s_add_u32 s0, s0, s17
32; GFX10-NEXT:    s_mov_b32 s4, s32
33; GFX10-NEXT:    s_addc_u32 s1, s1, 0
34; GFX10-NEXT:    v_mov_b32_e32 v0, 0
35; GFX10-NEXT:    v_mov_b32_e32 v1, s4
36; GFX10-NEXT:    s_mov_b32 s33, 0
37; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
38; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX10-NEXT:    s_lshl2_add_u32 s5, s5, 15
40; GFX10-NEXT:    s_and_b32 s5, s5, -16
41; GFX10-NEXT:    s_lshl_b32 s5, s5, 5
42; GFX10-NEXT:    s_add_u32 s32, s4, s5
43; GFX10-NEXT:    s_endpgm
44;
45; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
46; GFX11:       ; %bb.0:
47; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x0
48; GFX11-NEXT:    v_mov_b32_e32 v0, 0
49; GFX11-NEXT:    s_mov_b32 s32, 16
50; GFX11-NEXT:    s_mov_b32 s33, 0
51; GFX11-NEXT:    s_mov_b32 s0, s32
52; GFX11-NEXT:    scratch_store_b32 off, v0, s0
53; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX11-NEXT:    s_lshl2_add_u32 s1, s1, 15
55; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
56; GFX11-NEXT:    s_and_b32 s1, s1, -16
57; GFX11-NEXT:    s_lshl_b32 s1, s1, 5
58; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
59; GFX11-NEXT:    s_add_u32 s32, s0, s1
60; GFX11-NEXT:    s_endpgm
61  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
62  store i32 0, ptr addrspace(5) %alloca
63  ret void
64}
65
; Callable-function variant, align 4: the element count is loaded from @gv
; (load align 4), that many i32 elements are allocated in addrspace(5), and 0
; is stored through the resulting pointer. The alloca itself carries no
; explicit align keyword, so it relies on the default alignment for i32
; (presumably 4 on this target - confirm against the data layout).
; In the expected output s33 is first copied to a scratch SGPR (s7 on gfx900
; and gfx1010, s3 on gfx1100) and set to the incoming s32; before the return
; s32 is restored from s33 and s33 from the scratch SGPR.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of hand-editing.
66define void @func_dynamic_stackalloc_sgpr_align4() {
67; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4:
68; GFX9:       ; %bb.0:
69; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX9-NEXT:    s_mov_b32 s7, s33
71; GFX9-NEXT:    s_mov_b32 s33, s32
72; GFX9-NEXT:    s_addk_i32 s32, 0x400
73; GFX9-NEXT:    s_getpc_b64 s[4:5]
74; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
75; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
76; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
77; GFX9-NEXT:    s_mov_b32 s6, s32
78; GFX9-NEXT:    v_mov_b32_e32 v0, 0
79; GFX9-NEXT:    v_mov_b32_e32 v1, s6
80; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
81; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
83; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
85; GFX9-NEXT:    s_and_b32 s4, s4, -16
86; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
87; GFX9-NEXT:    s_add_u32 s32, s6, s4
88; GFX9-NEXT:    s_mov_b32 s32, s33
89; GFX9-NEXT:    s_mov_b32 s33, s7
90; GFX9-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NEXT:    s_setpc_b64 s[30:31]
92;
93; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4:
94; GFX10:       ; %bb.0:
95; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96; GFX10-NEXT:    s_mov_b32 s7, s33
97; GFX10-NEXT:    s_mov_b32 s33, s32
98; GFX10-NEXT:    s_addk_i32 s32, 0x200
99; GFX10-NEXT:    s_getpc_b64 s[4:5]
100; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
101; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
102; GFX10-NEXT:    s_mov_b32 s6, s32
103; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
104; GFX10-NEXT:    v_mov_b32_e32 v0, 0
105; GFX10-NEXT:    v_mov_b32_e32 v1, s6
106; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
107; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
109; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
110; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
111; GFX10-NEXT:    s_and_b32 s4, s4, -16
112; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
113; GFX10-NEXT:    s_add_u32 s32, s6, s4
114; GFX10-NEXT:    s_mov_b32 s32, s33
115; GFX10-NEXT:    s_mov_b32 s33, s7
116; GFX10-NEXT:    s_setpc_b64 s[30:31]
117;
118; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4:
119; GFX11:       ; %bb.0:
120; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121; GFX11-NEXT:    s_mov_b32 s3, s33
122; GFX11-NEXT:    s_mov_b32 s33, s32
123; GFX11-NEXT:    s_add_i32 s32, s32, 16
124; GFX11-NEXT:    s_getpc_b64 s[0:1]
125; GFX11-NEXT:    s_add_u32 s0, s0, gv@gotpcrel32@lo+4
126; GFX11-NEXT:    s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
127; GFX11-NEXT:    v_mov_b32_e32 v0, 0
128; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
129; GFX11-NEXT:    s_mov_b32 s2, s32
130; GFX11-NEXT:    scratch_store_b32 off, v0, s2
131; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
132; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
133; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX11-NEXT:    s_lshl2_add_u32 s0, s0, 15
135; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
136; GFX11-NEXT:    s_and_b32 s0, s0, -16
137; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
138; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
139; GFX11-NEXT:    s_add_u32 s32, s2, s0
140; GFX11-NEXT:    s_mov_b32 s32, s33
141; GFX11-NEXT:    s_mov_b32 s33, s3
142; GFX11-NEXT:    s_setpc_b64 s[30:31]
143  %n = load i32, ptr addrspace(4) @gv, align 4
144  %alloca = alloca i32, i32 %n, addrspace(5)
145  store i32 0, ptr addrspace(5) %alloca
146  ret void
147}
148
; Kernel variant, align 16: the element count %n arrives as a kernel argument,
; %n i32 elements are allocated in addrspace(5) with align 16, and 0 is stored
; through the resulting pointer.
; The expected instruction sequences below are the same as those of
; kernel_dynamic_stackalloc_sgpr_align4: raising the requested alignment from
; 4 to 16 adds no extra address rounding in any of the three configurations.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of hand-editing.
149define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
150; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
151; GFX9:       ; %bb.0:
152; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
153; GFX9-NEXT:    s_add_u32 s0, s0, s17
154; GFX9-NEXT:    s_movk_i32 s32, 0x400
155; GFX9-NEXT:    s_addc_u32 s1, s1, 0
156; GFX9-NEXT:    s_mov_b32 s4, s32
157; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX9-NEXT:    s_lshl2_add_u32 s5, s5, 15
159; GFX9-NEXT:    s_and_b32 s5, s5, -16
160; GFX9-NEXT:    v_mov_b32_e32 v0, 0
161; GFX9-NEXT:    v_mov_b32_e32 v1, s4
162; GFX9-NEXT:    s_lshl_b32 s5, s5, 6
163; GFX9-NEXT:    s_mov_b32 s33, 0
164; GFX9-NEXT:    s_add_u32 s32, s4, s5
165; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
166; GFX9-NEXT:    s_endpgm
167;
168; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
169; GFX10:       ; %bb.0:
170; GFX10-NEXT:    s_load_dword s5, s[8:9], 0x0
171; GFX10-NEXT:    s_movk_i32 s32, 0x200
172; GFX10-NEXT:    s_add_u32 s0, s0, s17
173; GFX10-NEXT:    s_mov_b32 s4, s32
174; GFX10-NEXT:    s_addc_u32 s1, s1, 0
175; GFX10-NEXT:    v_mov_b32_e32 v0, 0
176; GFX10-NEXT:    v_mov_b32_e32 v1, s4
177; GFX10-NEXT:    s_mov_b32 s33, 0
178; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
179; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX10-NEXT:    s_lshl2_add_u32 s5, s5, 15
181; GFX10-NEXT:    s_and_b32 s5, s5, -16
182; GFX10-NEXT:    s_lshl_b32 s5, s5, 5
183; GFX10-NEXT:    s_add_u32 s32, s4, s5
184; GFX10-NEXT:    s_endpgm
185;
186; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
187; GFX11:       ; %bb.0:
188; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x0
189; GFX11-NEXT:    v_mov_b32_e32 v0, 0
190; GFX11-NEXT:    s_mov_b32 s32, 16
191; GFX11-NEXT:    s_mov_b32 s33, 0
192; GFX11-NEXT:    s_mov_b32 s0, s32
193; GFX11-NEXT:    scratch_store_b32 off, v0, s0
194; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX11-NEXT:    s_lshl2_add_u32 s1, s1, 15
196; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
197; GFX11-NEXT:    s_and_b32 s1, s1, -16
198; GFX11-NEXT:    s_lshl_b32 s1, s1, 5
199; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
200; GFX11-NEXT:    s_add_u32 s32, s0, s1
201; GFX11-NEXT:    s_endpgm
202  %alloca = alloca i32, i32 %n, align 16, addrspace(5)
203  store i32 0, ptr addrspace(5) %alloca
204  ret void
205}
206
; Callable-function variant, align 16: the element count is loaded from @gv,
; that many i32 elements are allocated in addrspace(5) with align 16, and 0 is
; stored through the resulting pointer.
; The align 16 belongs on the alloca, which is what this test is named after;
; it previously sat on the load of @gv, leaving the alloca at its default
; alignment. The load now uses align 4 like the align4 function test.
; The expected output is not expected to change: the align 4 and align 16
; kernel tests in this file already share identical assertions, and so do the
; align 4 / align 16 loads of @gv. Regenerate the assertion lines with
; utils/update_llc_test_checks.py to confirm; do not hand-edit them.
207define void @func_dynamic_stackalloc_sgpr_align16() {
208; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16:
209; GFX9:       ; %bb.0:
210; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; GFX9-NEXT:    s_mov_b32 s7, s33
212; GFX9-NEXT:    s_mov_b32 s33, s32
213; GFX9-NEXT:    s_addk_i32 s32, 0x400
214; GFX9-NEXT:    s_getpc_b64 s[4:5]
215; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
216; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
217; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
218; GFX9-NEXT:    s_mov_b32 s6, s32
219; GFX9-NEXT:    v_mov_b32_e32 v0, 0
220; GFX9-NEXT:    v_mov_b32_e32 v1, s6
221; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
222; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
224; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
226; GFX9-NEXT:    s_and_b32 s4, s4, -16
227; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
228; GFX9-NEXT:    s_add_u32 s32, s6, s4
229; GFX9-NEXT:    s_mov_b32 s32, s33
230; GFX9-NEXT:    s_mov_b32 s33, s7
231; GFX9-NEXT:    s_waitcnt vmcnt(0)
232; GFX9-NEXT:    s_setpc_b64 s[30:31]
233;
234; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16:
235; GFX10:       ; %bb.0:
236; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237; GFX10-NEXT:    s_mov_b32 s7, s33
238; GFX10-NEXT:    s_mov_b32 s33, s32
239; GFX10-NEXT:    s_addk_i32 s32, 0x200
240; GFX10-NEXT:    s_getpc_b64 s[4:5]
241; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
242; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
243; GFX10-NEXT:    s_mov_b32 s6, s32
244; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
245; GFX10-NEXT:    v_mov_b32_e32 v0, 0
246; GFX10-NEXT:    v_mov_b32_e32 v1, s6
247; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
248; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
250; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
252; GFX10-NEXT:    s_and_b32 s4, s4, -16
253; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
254; GFX10-NEXT:    s_add_u32 s32, s6, s4
255; GFX10-NEXT:    s_mov_b32 s32, s33
256; GFX10-NEXT:    s_mov_b32 s33, s7
257; GFX10-NEXT:    s_setpc_b64 s[30:31]
258;
259; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16:
260; GFX11:       ; %bb.0:
261; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; GFX11-NEXT:    s_mov_b32 s3, s33
263; GFX11-NEXT:    s_mov_b32 s33, s32
264; GFX11-NEXT:    s_add_i32 s32, s32, 16
265; GFX11-NEXT:    s_getpc_b64 s[0:1]
266; GFX11-NEXT:    s_add_u32 s0, s0, gv@gotpcrel32@lo+4
267; GFX11-NEXT:    s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
268; GFX11-NEXT:    v_mov_b32_e32 v0, 0
269; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
270; GFX11-NEXT:    s_mov_b32 s2, s32
271; GFX11-NEXT:    scratch_store_b32 off, v0, s2
272; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
274; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX11-NEXT:    s_lshl2_add_u32 s0, s0, 15
276; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
277; GFX11-NEXT:    s_and_b32 s0, s0, -16
278; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
279; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
280; GFX11-NEXT:    s_add_u32 s32, s2, s0
281; GFX11-NEXT:    s_mov_b32 s32, s33
282; GFX11-NEXT:    s_mov_b32 s33, s3
283; GFX11-NEXT:    s_setpc_b64 s[30:31]
284  %n = load i32, ptr addrspace(4) @gv, align 4
285  %alloca = alloca i32, i32 %n, align 16, addrspace(5)
286  store i32 0, ptr addrspace(5) %alloca
287  ret void
288}
289
; Kernel variant, align 32: the element count %n arrives as a kernel argument,
; %n i32 elements are allocated in addrspace(5) with align 32, and 0 is stored
; through the resulting pointer.
; Unlike the align 4 and align 16 kernels, the expected output rounds the
; current s32 value up before using it as the allocation address: it adds
; 0x7ff and masks with 0xfffff800 on gfx900, and adds 0x3ff and masks with
; 0xfffffc00 on gfx1010 and gfx1100. The store goes to that rounded address and
; the new s32 is the rounded address plus the scaled allocation size.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of hand-editing.
290define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
291; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
292; GFX9:       ; %bb.0:
293; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x0
294; GFX9-NEXT:    s_movk_i32 s32, 0x800
295; GFX9-NEXT:    s_add_u32 s0, s0, s17
296; GFX9-NEXT:    s_addc_u32 s1, s1, 0
297; GFX9-NEXT:    s_add_u32 s5, s32, 0x7ff
298; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
300; GFX9-NEXT:    s_and_b32 s5, s5, 0xfffff800
301; GFX9-NEXT:    s_and_b32 s4, s4, -16
302; GFX9-NEXT:    v_mov_b32_e32 v0, 0
303; GFX9-NEXT:    v_mov_b32_e32 v1, s5
304; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
305; GFX9-NEXT:    s_mov_b32 s33, 0
306; GFX9-NEXT:    s_add_u32 s32, s5, s4
307; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
308; GFX9-NEXT:    s_endpgm
309;
310; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
311; GFX10:       ; %bb.0:
312; GFX10-NEXT:    s_load_dword s4, s[8:9], 0x0
313; GFX10-NEXT:    s_movk_i32 s32, 0x400
314; GFX10-NEXT:    s_add_u32 s0, s0, s17
315; GFX10-NEXT:    s_addc_u32 s1, s1, 0
316; GFX10-NEXT:    s_add_u32 s5, s32, 0x3ff
317; GFX10-NEXT:    v_mov_b32_e32 v0, 0
318; GFX10-NEXT:    s_and_b32 s5, s5, 0xfffffc00
319; GFX10-NEXT:    s_mov_b32 s33, 0
320; GFX10-NEXT:    v_mov_b32_e32 v1, s5
321; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
322; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
324; GFX10-NEXT:    s_and_b32 s4, s4, -16
325; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
326; GFX10-NEXT:    s_add_u32 s32, s5, s4
327; GFX10-NEXT:    s_endpgm
328;
329; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
330; GFX11:       ; %bb.0:
331; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
332; GFX11-NEXT:    s_mov_b32 s32, 32
333; GFX11-NEXT:    v_mov_b32_e32 v0, 0
334; GFX11-NEXT:    s_add_u32 s1, s32, 0x3ff
335; GFX11-NEXT:    s_mov_b32 s33, 0
336; GFX11-NEXT:    s_and_b32 s1, s1, 0xfffffc00
337; GFX11-NEXT:    scratch_store_b32 off, v0, s1
338; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX11-NEXT:    s_lshl2_add_u32 s0, s0, 15
340; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
341; GFX11-NEXT:    s_and_b32 s0, s0, -16
342; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
343; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
344; GFX11-NEXT:    s_add_u32 s32, s1, s0
345; GFX11-NEXT:    s_endpgm
346  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
347  store i32 0, ptr addrspace(5) %alloca
348  ret void
349}
350
; Callable-function variant, align 32: the element count is loaded from @gv
; (no explicit load alignment), that many i32 elements are allocated in
; addrspace(5) with align 32, and 0 is stored through the resulting pointer.
; The %out parameter is not referenced anywhere in the body.
; Compared with the align 4 / align 16 function tests, the expected output
; also computes a rounded-up s33 from s32 in the prologue, keeps the incoming
; s32 in s34 (after saving the old s34 to a scratch SGPR) and restores s32 from
; s34 before returning. As in the align 32 kernel test, the allocation address
; is the current s32 rounded up (add 0x7ff / mask 0xfffff800 on gfx900,
; add 0x3ff / mask 0xfffffc00 on gfx1010 and gfx1100).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of hand-editing.
351define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
352; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32:
353; GFX9:       ; %bb.0:
354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355; GFX9-NEXT:    s_mov_b32 s6, s33
356; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
357; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
358; GFX9-NEXT:    s_mov_b32 s7, s34
359; GFX9-NEXT:    s_mov_b32 s34, s32
360; GFX9-NEXT:    s_addk_i32 s32, 0x1000
361; GFX9-NEXT:    s_getpc_b64 s[4:5]
362; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
363; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
364; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
365; GFX9-NEXT:    v_mov_b32_e32 v0, 0
366; GFX9-NEXT:    s_mov_b32 s33, s6
367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
369; GFX9-NEXT:    s_add_u32 s5, s32, 0x7ff
370; GFX9-NEXT:    s_and_b32 s5, s5, 0xfffff800
371; GFX9-NEXT:    v_mov_b32_e32 v1, s5
372; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
375; GFX9-NEXT:    s_and_b32 s4, s4, -16
376; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
377; GFX9-NEXT:    s_add_u32 s32, s5, s4
378; GFX9-NEXT:    s_mov_b32 s32, s34
379; GFX9-NEXT:    s_mov_b32 s34, s7
380; GFX9-NEXT:    s_waitcnt vmcnt(0)
381; GFX9-NEXT:    s_setpc_b64 s[30:31]
382;
383; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32:
384; GFX10:       ; %bb.0:
385; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
386; GFX10-NEXT:    s_mov_b32 s6, s33
387; GFX10-NEXT:    s_add_i32 s33, s32, 0x3e0
388; GFX10-NEXT:    s_mov_b32 s7, s34
389; GFX10-NEXT:    s_and_b32 s33, s33, 0xfffffc00
390; GFX10-NEXT:    s_mov_b32 s34, s32
391; GFX10-NEXT:    s_addk_i32 s32, 0x800
392; GFX10-NEXT:    s_getpc_b64 s[4:5]
393; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
394; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
395; GFX10-NEXT:    v_mov_b32_e32 v0, 0
396; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
397; GFX10-NEXT:    s_mov_b32 s33, s6
398; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
400; GFX10-NEXT:    s_add_u32 s5, s32, 0x3ff
401; GFX10-NEXT:    s_and_b32 s5, s5, 0xfffffc00
402; GFX10-NEXT:    v_mov_b32_e32 v1, s5
403; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
404; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
406; GFX10-NEXT:    s_and_b32 s4, s4, -16
407; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
408; GFX10-NEXT:    s_add_u32 s32, s5, s4
409; GFX10-NEXT:    s_mov_b32 s32, s34
410; GFX10-NEXT:    s_mov_b32 s34, s7
411; GFX10-NEXT:    s_setpc_b64 s[30:31]
412;
413; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
414; GFX11:       ; %bb.0:
415; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416; GFX11-NEXT:    s_mov_b32 s2, s33
417; GFX11-NEXT:    s_add_i32 s33, s32, 31
418; GFX11-NEXT:    s_mov_b32 s3, s34
419; GFX11-NEXT:    s_and_not1_b32 s33, s33, 31
420; GFX11-NEXT:    s_mov_b32 s34, s32
421; GFX11-NEXT:    s_add_i32 s32, s32, 64
422; GFX11-NEXT:    s_getpc_b64 s[0:1]
423; GFX11-NEXT:    s_add_u32 s0, s0, gv@gotpcrel32@lo+4
424; GFX11-NEXT:    s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
425; GFX11-NEXT:    v_mov_b32_e32 v0, 0
426; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
427; GFX11-NEXT:    s_mov_b32 s33, s2
428; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
430; GFX11-NEXT:    s_add_u32 s1, s32, 0x3ff
431; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
432; GFX11-NEXT:    s_and_b32 s1, s1, 0xfffffc00
433; GFX11-NEXT:    scratch_store_b32 off, v0, s1
434; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX11-NEXT:    s_lshl2_add_u32 s0, s0, 15
436; GFX11-NEXT:    s_and_b32 s0, s0, -16
437; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
438; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
439; GFX11-NEXT:    s_add_u32 s32, s1, s0
440; GFX11-NEXT:    s_mov_b32 s32, s34
441; GFX11-NEXT:    s_mov_b32 s34, s3
442; GFX11-NEXT:    s_setpc_b64 s[30:31]
443  %n = load i32, ptr addrspace(4) @gv
444  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
445  store i32 0, ptr addrspace(5) %alloca
446  ret void
447}
448