; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch | FileCheck -check-prefixes=FLATSCR,DEFAULTSIZE %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=FLATSCR,ASSUME1024 %s

; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.

; Test that we can select a statically sized alloca outside of the
; entry block.

; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x400
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s8, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_cmp_lg_u32 s9, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_mov_b32 s6, s32
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
; MUBUF-NEXT:    s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s6
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  .LBB0_3: ; %bb.2
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_mov_b32 s32, 16
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_mov_b32 s2, s32
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT:    s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  .LBB0_3: ; %bb.2
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.2

bb.2:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 16

; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040

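; Same as the previous test, but the alloca is overaligned (align 64), so the
; generated code rounds the incoming stack pointer up to the required
; alignment before placing the allocation.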
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; MUBUF-NEXT:    s_add_u32 s0, s0, s17
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s4, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s4, s32, 0xfff
; MUBUF-NEXT:    s_and_b32 s4, s4, 0xfffff000
; MUBUF-NEXT:    s_lshl_b32 s5, s5, 2
; MUBUF-NEXT:    s_add_i32 s32, s4, 0x1000
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, s4
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s4, s4, s5
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s4
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  .LBB1_2: ; %bb.1
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_mov_b32 s32, 64
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s0, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s0, s32, 0xfff
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    s_and_b32 s0, s0, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s1, s1, 2
; FLATSCR-NEXT:    s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT:    s_add_i32 s0, s0, s1
; FLATSCR-NEXT:    scratch_load_dword v2, off, s0
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  .LBB1_2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.1

bb.1:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}

; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 64

; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088


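; Same pattern in a non-kernel function. The conditions arrive in VGPRs, so
; the block containing the alloca is reached under divergent control flow
; (exec-mask branches) rather than uniform scalar branches.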
define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_addk_i32 s32, 0x400
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; MUBUF-NEXT:    s_and_b64 exec, exec, vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_mov_b32 s6, s32
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT:    s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  .LBB2_3: ; %bb.2
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s32, s33
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT:    s_and_b64 exec, exec, vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_mov_b32 s2, s32
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT:    s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  .LBB2_3: ; %bb.2
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s32, s33
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.2

bb.2:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}

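; Non-kernel version with an overaligned (align 64) alloca; both the frame
; pointer setup in the prologue and the alloca's stack address are realigned.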
define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    s_add_i32 s33, s32, 0xfc0
; MUBUF-NEXT:    s_mov_b32 s8, s34
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfffff000
; MUBUF-NEXT:    s_mov_b32 s34, s32
; MUBUF-NEXT:    s_addk_i32 s32, 0x2000
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB3_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0xfff
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v4, s6
; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT:    s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  .LBB3_2: ; %bb.1
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s32, s34
; MUBUF-NEXT:    s_mov_b32 s34, s8
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    s_add_i32 s33, s32, 63
; FLATSCR-NEXT:    s_mov_b32 s4, s34
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_andn2_b32 s33, s33, 63
; FLATSCR-NEXT:    s_mov_b32 s34, s32
; FLATSCR-NEXT:    s_addk_i32 s32, 0x80
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB3_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0xfff
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v4, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v5, 1
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT:    s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  .LBB3_2: ; %bb.1
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s32, s34
; FLATSCR-NEXT:    s_mov_b32 s34, s4
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.1

bb.1:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ASSUME1024: {{.*}}
; DEFAULTSIZE: {{.*}}
; DEFAULTSIZE-V5: {{.*}}