; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefix=MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=FLATSCR %s

; Make sure the correct frame offset is used with the local
; frame area.
;
; %pin.low is allocated to offset 0.
;
; %local.area is assigned to the local frame offset by the
; LocalStackSlotAllocation pass at offset 4096.
;
; The %load1 access to %gep.large.offset initially used the stack
; pointer register and directly referenced the frame index. After
; LocalStackSlotAllocation, it would no longer refer to a frame index
; so eliminateFrameIndex would not adjust the access to use the
; correct FP offset.
define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; MUBUF-LABEL: local_stack_offset_uses_sp:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 s0, s0, s17
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x3000
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    v_add_u32_e32 v0, 64, v1
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x2000
; MUBUF-NEXT:    s_mov_b32 s4, 0
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:  .LBB0_1: ; %loadstoreloop
; MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT:    v_mov_b32_e32 v3, 0x3000
; MUBUF-NEXT:    v_add_u32_e32 v2, s4, v3
; MUBUF-NEXT:    s_add_i32 s4, s4, 1
; MUBUF-NEXT:    s_cmpk_lt_u32 s4, 0x2120
; MUBUF-NEXT:    buffer_store_byte v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_1
; MUBUF-NEXT:  ; %bb.2: ; %split
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x50d0
; MUBUF-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; MUBUF-NEXT:    v_mov_b32_e32 v6, 0
; MUBUF-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v4
; MUBUF-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v5, vcc
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: local_stack_offset_uses_sp:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    s_movk_i32 s0, 0x2000
; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s0, 0
; FLATSCR-NEXT:  .LBB0_1: ; %loadstoreloop
; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT:    s_add_i32 s1, s0, 0x3000
; FLATSCR-NEXT:    s_add_i32 s0, s0, 1
; FLATSCR-NEXT:    s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT:    scratch_store_byte off, v0, s1
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_1
; FLATSCR-NEXT:  ; %bb.2: ; %split
; FLATSCR-NEXT:    s_movk_i32 s0, 0x2000
; FLATSCR-NEXT:    s_addk_i32 s0, 0x3000
; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_movk_i32 s0, 0x3000
; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; FLATSCR-NEXT:    v_mov_b32_e32 v4, 0
; FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm
entry:
  %pin.low = alloca i32, align 8192, addrspace(5)
  %local.area = alloca [1060 x i64], align 4096, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %pin.low
  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 1050
  %gep.small.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 8
  %load0 = load volatile i64, ptr addrspace(5) %gep.large.offset
  %load1 = load volatile i64, ptr addrspace(5) %gep.small.offset
  %add0 = add i64 %load0, %load1
  store volatile i64 %add0, ptr addrspace(1) %out
  ret void
}
; Same stack layout as local_stack_offset_uses_sp, but in a non-kernel
; function, so the checks also cover the FP (s33) setup/restore sequence.
define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; MUBUF-LABEL: func_local_stack_offset_uses_sp:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s5, s33
; MUBUF-NEXT:    s_add_i32 s33, s32, 0x7ffc0
; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfff80000
; MUBUF-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
; MUBUF-NEXT:    v_add_u32_e32 v3, 0x3000, v3
; MUBUF-NEXT:    s_mov_b32 s6, s34
; MUBUF-NEXT:    v_add_u32_e32 v2, 64, v3
; MUBUF-NEXT:    v_mov_b32_e32 v3, 0
; MUBUF-NEXT:    v_mov_b32_e32 v4, 0x2000
; MUBUF-NEXT:    s_mov_b32 s4, 0
; MUBUF-NEXT:    s_mov_b32 s34, s32
; MUBUF-NEXT:    s_add_i32 s32, s32, 0x200000
; MUBUF-NEXT:    buffer_store_dword v3, v4, s[0:3], s33 offen
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:  .LBB1_1: ; %loadstoreloop
; MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT:    v_lshrrev_b32_e64 v5, 6, s33
; MUBUF-NEXT:    v_add_u32_e32 v4, s4, v5
; MUBUF-NEXT:    v_mov_b32_e32 v5, 0x3000
; MUBUF-NEXT:    s_add_i32 s4, s4, 1
; MUBUF-NEXT:    v_add_u32_e32 v4, v5, v4
; MUBUF-NEXT:    s_cmpk_lt_u32 s4, 0x2120
; MUBUF-NEXT:    buffer_store_byte v3, v4, s[0:3], 0 offen
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_1
; MUBUF-NEXT:  ; %bb.2: ; %split
; MUBUF-NEXT:    v_lshrrev_b32_e64 v4, 6, s33
; MUBUF-NEXT:    v_add_u32_e32 v3, 0x50d0, v4
; MUBUF-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s32, s34
; MUBUF-NEXT:    s_mov_b32 s34, s6
; MUBUF-NEXT:    s_mov_b32 s33, s5
; MUBUF-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
; MUBUF-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
; MUBUF-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s2, s33
; FLATSCR-NEXT:    s_add_i32 s33, s32, 0x1fff
; FLATSCR-NEXT:    s_and_b32 s33, s33, 0xffffe000
; FLATSCR-NEXT:    s_mov_b32 s3, s34
; FLATSCR-NEXT:    s_mov_b32 s34, s32
; FLATSCR-NEXT:    s_add_i32 s32, s32, 0x8000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
; FLATSCR-NEXT:    s_add_i32 s0, s33, 0x2000
; FLATSCR-NEXT:    scratch_store_dword off, v2, s0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s0, 0
; FLATSCR-NEXT:  .LBB1_1: ; %loadstoreloop
; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT:    s_add_i32 s1, s33, s0
; FLATSCR-NEXT:    s_addk_i32 s1, 0x3000
; FLATSCR-NEXT:    s_add_i32 s0, s0, 1
; FLATSCR-NEXT:    s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT:    scratch_store_byte off, v2, s1
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_1
; FLATSCR-NEXT:  ; %bb.2: ; %split
; FLATSCR-NEXT:    s_movk_i32 s0, 0x2000
; FLATSCR-NEXT:    s_add_i32 s1, s33, s0
; FLATSCR-NEXT:    s_add_i32 s0, s1, 0x3000
; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_add_i32 s0, s33, 0x3000
; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s32, s34
; FLATSCR-NEXT:    s_mov_b32 s34, s3
; FLATSCR-NEXT:    s_mov_b32 s33, s2
; FLATSCR-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; FLATSCR-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; FLATSCR-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
entry:
  %pin.low = alloca i32, align 8192, addrspace(5)
  %local.area = alloca [1060 x i64], align 4096, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %pin.low
  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 1050
  %gep.small.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 8
  %load0 = load volatile i64, ptr addrspace(5) %gep.large.offset
  %load1 = load volatile i64, ptr addrspace(5) %gep.small.offset
  %add0 = add i64 %load0, %load1
  store volatile i64 %add0, ptr addrspace(1) %out
  ret void
}
; Variant using <3 x i64> loads/stores from a large local frame, covering
; multi-dword (dwordx2/dwordx4) accesses at large frame offsets.
define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) {
; MUBUF-LABEL: local_stack_offset_uses_sp_flat:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 s0, s0, s17
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x2000
; MUBUF-NEXT:    s_mov_b32 s4, 0
; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:  .LBB2_1: ; %loadstoreloop
; MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    v_add_u32_e32 v1, s4, v2
; MUBUF-NEXT:    s_add_i32 s4, s4, 1
; MUBUF-NEXT:    s_cmpk_lt_u32 s4, 0x2120
; MUBUF-NEXT:    buffer_store_byte v0, v1, s[0:3], 0 offen
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_cbranch_scc1 .LBB2_1
; MUBUF-NEXT:  ; %bb.2: ; %split
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT:    s_movk_i32 s4, 0x12d4
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    v_or_b32_e32 v0, 0x12c0, v1
; MUBUF-NEXT:    v_or_b32_e32 v1, s4, v2
; MUBUF-NEXT:    s_movk_i32 s4, 0x12d0
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_or_b32_e32 v1, s4, v2
; MUBUF-NEXT:    s_movk_i32 s4, 0x12c4
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_or_b32_e32 v1, s4, v2
; MUBUF-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_movk_i32 s4, 0x12cc
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT:    v_or_b32_e32 v0, s4, v1
; MUBUF-NEXT:    s_movk_i32 s4, 0x12c8
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    v_or_b32_e32 v1, s4, v2
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_mov_b32_e32 v3, 0x4000
; MUBUF-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_mov_b32_e32 v10, 0x4000
; MUBUF-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_mov_b32_e32 v11, 0x4000
; MUBUF-NEXT:    buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_mov_b32_e32 v12, 0x4000
; MUBUF-NEXT:    buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; MUBUF-NEXT:    buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_co_u32_e32 v2, vcc, v1, v2
; MUBUF-NEXT:    buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
; MUBUF-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v8
; MUBUF-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v9, vcc
; MUBUF-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
; MUBUF-NEXT:    v_mov_b32_e32 v12, 0
; MUBUF-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dwordx2 v12, v[4:5], s[4:5] offset:16
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    global_store_dwordx4 v12, v[0:3], s[4:5]
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: local_stack_offset_uses_sp_flat:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    s_mov_b32 s0, 0
; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:1024
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:  .LBB2_1: ; %loadstoreloop
; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT:    s_add_i32 s1, s0, 0x2000
; FLATSCR-NEXT:    s_add_i32 s0, s0, 1
; FLATSCR-NEXT:    s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT:    scratch_store_byte off, v0, s1
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT:  ; %bb.2: ; %split
; FLATSCR-NEXT:    s_movk_i32 s0, 0x1000
; FLATSCR-NEXT:    s_addk_i32 s0, 0x2000
; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_movk_i32 s0, 0x2000
; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_load_dwordx4 v[4:7], off, s0 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; FLATSCR-NEXT:    v_mov_b32_e32 v12, 0
; FLATSCR-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
; FLATSCR-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; FLATSCR-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v10
; FLATSCR-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v11, vcc
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dwordx2 v12, v[4:5], s[0:1] offset:16
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm
entry:
  %pin.low = alloca i32, align 1024, addrspace(5)
  %local.area = alloca [160 x <3 x i64>], align 8192, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %pin.low
  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [160 x <3 x i64>], ptr addrspace(5) %local.area, i64 0, i64 150
  %load0 = load volatile <3 x i64>, ptr addrspace(5) %gep.large.offset
  %load1 = load volatile <3 x i64>, ptr addrspace(5) %local.area
  %add0 = add <3 x i64> %load0, %load1
  store volatile <3 x i64> %add0, ptr addrspace(1) %out
  ret void
}

declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0

attributes #0 = { argmemonly nounwind willreturn writeonly }
